/******************** -*- mode: C; tab-width: 8 -*- ********************
*
*	Run-time assembler for IA-32 and AMD64
*
***********************************************************************/


/***********************************************************************
*
*  This file is derived from CCG.
*
*  Copyright 1999, 2000, 2001, 2002, 2003 Ian Piumarta
*
*  Adaptations and enhancements for AMD64 support, Copyright 2003-2008
*    Gwenole Beauchesne
*
*  Basilisk II (C) 1997-2008 Christian Bauer
*
*  This program is free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  This program is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  You should have received a copy of the GNU General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
***********************************************************************/

#ifndef X86_RTASM_H
#define X86_RTASM_H

/* NOTES
 *
 *	o Best viewed on a 1024x768 screen with fixed-6x10 font ;-)
 *
 * TODO
 *
 *	o Fix FIXMEs
 *	o i387 FPU instructions
 *	o SSE instructions
 *	o Optimize for cases where register numbers are not integral constants
 */

/* --- Configuration ------------------------------------------------------- */

/* Define to use a "flat" register set, i.e. a distinct register number
   for each size variant.  */
#ifndef X86_FLAT_REGISTERS
    #define X86_FLAT_REGISTERS 1
#endif

/* Define to generate x86-64 code.  */
#ifndef X86_TARGET_64BIT
    #define X86_TARGET_64BIT 0
#endif

/* Define to optimize ALU instructions.  */
#ifndef X86_OPTIMIZE_ALU
    #define X86_OPTIMIZE_ALU 1
#endif

/* Define to optimize rotate/shift instructions.  */
#ifndef X86_OPTIMIZE_ROTSHI
    #define X86_OPTIMIZE_ROTSHI 1
#endif

/* Define to optimize absolute addresses into RIP-relative addressing.  */
#ifndef X86_RIP_RELATIVE_ADDR
    #define X86_RIP_RELATIVE_ADDR 1
#endif


/* --- Macros -------------------------------------------------------------- */

/* Functions used to emit code.
 *
 *	x86_emit_byte(B)
 *	x86_emit_word(W)
 *	x86_emit_long(L)
 *	x86_emit_quad(Q)
 */

/* Get pointer to current code
 *
 *	x86_get_target()
 */

/* Abort assembler, fatal failure.
 *
 *	x86_emit_failure(MSG)
 */

#define x86_emit_failure0(MSG) (x86_emit_failure(MSG), 0)
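
/* As an illustration only: a minimal sketch of hook definitions the including
 * code could provide (hypothetical buffer name "code_ptr"; the real including
 * code supplies its own definitions):
 *
 *	static unsigned char *code_ptr;
 *	#define x86_get_target()      (code_ptr)
 *	#define x86_emit_byte(B)      ((void)(*code_ptr++ = (unsigned char)(B)))
 *	#define x86_emit_word(W)      (x86_emit_byte(W), x86_emit_byte((W) >> 8))
 *	#define x86_emit_long(L)      (x86_emit_word(L), x86_emit_word((L) >> 16))
 *	#define x86_emit_quad(Q)      (x86_emit_long(Q), x86_emit_long((Q) >> 32))
 *	#define x86_emit_failure(MSG) abort()
 */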


/* --- Register set -------------------------------------------------------- */

enum
{
    X86_RIP = -2,
    #if X86_FLAT_REGISTERS
    X86_NOREG = 0,
    X86_Reg8L_Base = 0x10,
    X86_Reg8H_Base = 0x20,
    X86_Reg16_Base = 0x30,
    X86_Reg32_Base = 0x40,
    X86_Reg64_Base = 0x50,
    X86_RegMMX_Base = 0x60,
    X86_RegXMM_Base = 0x70,
    #else
    X86_NOREG = -1,
    X86_Reg8L_Base = 0,
    X86_Reg8H_Base = 16,
    X86_Reg16_Base = 0,
    X86_Reg32_Base = 0,
    X86_Reg64_Base = 0,
    X86_RegMMX_Base = 0,
    X86_RegXMM_Base = 0,
    #endif
};

enum
{
    X86_AL = X86_Reg8L_Base,
    X86_CL, X86_DL, X86_BL,
    X86_SPL, X86_BPL, X86_SIL, X86_DIL,
    X86_R8B, X86_R9B, X86_R10B, X86_R11B,
    X86_R12B, X86_R13B, X86_R14B, X86_R15B,
    X86_AH = X86_Reg8H_Base + 4,
    X86_CH, X86_DH, X86_BH
};

enum
{
    X86_AX = X86_Reg16_Base,
    X86_CX, X86_DX, X86_BX,
    X86_SP, X86_BP, X86_SI, X86_DI,
    X86_R8W, X86_R9W, X86_R10W, X86_R11W,
    X86_R12W, X86_R13W, X86_R14W, X86_R15W
};

enum
{
    X86_EAX = X86_Reg32_Base,
    X86_ECX, X86_EDX, X86_EBX,
    X86_ESP, X86_EBP, X86_ESI, X86_EDI,
    X86_R8D, X86_R9D, X86_R10D, X86_R11D,
    X86_R12D, X86_R13D, X86_R14D, X86_R15D
};

enum
{
    X86_RAX = X86_Reg64_Base,
    X86_RCX, X86_RDX, X86_RBX,
    X86_RSP, X86_RBP, X86_RSI, X86_RDI,
    X86_R8, X86_R9, X86_R10, X86_R11,
    X86_R12, X86_R13, X86_R14, X86_R15
};

enum
{
    X86_MM0 = X86_RegMMX_Base,
    X86_MM1, X86_MM2, X86_MM3,
    X86_MM4, X86_MM5, X86_MM6, X86_MM7,
};

enum
{
    X86_XMM0 = X86_RegXMM_Base,
    X86_XMM1, X86_XMM2, X86_XMM3,
    X86_XMM4, X86_XMM5, X86_XMM6, X86_XMM7,
    X86_XMM8, X86_XMM9, X86_XMM10, X86_XMM11,
    X86_XMM12, X86_XMM13, X86_XMM14, X86_XMM15
};

/* Register control and access
 *
 *	_r0P(R)	Null register?
 *	_rIP(R)	RIP register?
 *	_rXP(R)	Extended register?
 *
 *	_rC(R)	Class of register (only valid if X86_FLAT_REGISTERS)
 *	_rR(R)	Full register number
 *	_rN(R)	Short register number for encoding
 *
 *	_r1(R)	8-bit register ID
 *	_r2(R)	16-bit register ID
 *	_r4(R)	32-bit register ID
 *	_r8(R)	64-bit register ID
 *	_rM(R)	MMX register ID
 *	_rX(R)	XMM register ID
 *	_rA(R)	Address register ID used for EA calculation
 */

#define _r0P(R)     ((int)(R) == (int)X86_NOREG)
#define _rIP(R)     (X86_TARGET_64BIT ? ((int)(R) == (int)X86_RIP) : 0)

#if X86_FLAT_REGISTERS
    #define _rC(R)      ((R) & 0xf0)
    #define _rR(R)      ((R) & 0x0f)
    #define _rN(R)      ((R) & 0x07)
    #define _rXP(R)     ((R) > 0 && _rR(R) > 7)
#else
    #define _rN(R)      ((R) & 0x07)
    #define _rR(R)      (int(R))
    #define _rXP(R)     (_rR(R) > 7 && _rR(R) < 16)
#endif

#if !defined (_ASM_SAFETY) || !X86_FLAT_REGISTERS
    #define _r1(R)      _rN(R)
    #define _r2(R)      _rN(R)
    #define _r4(R)      _rN(R)
    #define _r8(R)      _rN(R)
    #define _rA(R)      _rN(R)
    #define _rM(R)      _rN(R)
    #define _rX(R)      _rN(R)
#else
    #define _r1(R)      (((_rC(R) & (X86_Reg8L_Base | X86_Reg8H_Base)) != 0)   ? _rN(R) : x86_emit_failure0("8-bit register required"))
    #define _r2(R)      ((_rC(R) == X86_Reg16_Base)                ? _rN(R) : x86_emit_failure0("16-bit register required"))
    #define _r4(R)      ((_rC(R) == X86_Reg32_Base)                ? _rN(R) : x86_emit_failure0("32-bit register required"))
    #define _r8(R)      ((_rC(R) == X86_Reg64_Base)                ? _rN(R) : x86_emit_failure0("64-bit register required"))
    #define _rA(R)      (X86_TARGET_64BIT ? \
                         ((_rC(R) == X86_Reg64_Base)                ? _rN(R) : x86_emit_failure0("not a valid 64-bit base/index expression")) : \
                         ((_rC(R) == X86_Reg32_Base)                ? _rN(R) : x86_emit_failure0("not a valid 32-bit base/index expression")))
    #define _rM(R)      ((_rC(R) == X86_RegMMX_Base)               ? _rN(R) : x86_emit_failure0("MMX register required"))
    #define _rX(R)      ((_rC(R) == X86_RegXMM_Base)               ? _rN(R) : x86_emit_failure0("SSE register required"))
#endif

#define _rSP()      (X86_TARGET_64BIT ? (int)X86_RSP : (int)X86_ESP)
#define _r1e8lP(R)  (int(R) >= X86_SPL && int(R) <= X86_DIL)
#define _rbpP(R)    (_rR(R) == _rR(X86_RBP))
#define _rspP(R)    (_rR(R) == _rR(X86_RSP))
#define _rbp13P(R)  (_rN(R) == _rN(X86_RBP))
#define _rsp12P(R)  (_rN(R) == _rN(X86_RSP))
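
/* Illustration: with X86_FLAT_REGISTERS (the default) the class and numbers
 * can be recovered from any register ID, e.g.:
 *
 *	X86_EBX (0x43):  _rC = X86_Reg32_Base, _rR = 3, _rN = 3, _rXP = 0
 *	X86_R8D (0x48):  _rC = X86_Reg32_Base, _rR = 8, _rN = 0, _rXP = 1
 */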


/* ========================================================================= */
/* --- UTILITY ------------------------------------------------------------- */
/* ========================================================================= */

typedef signed char _sc;
typedef unsigned char _uc;
typedef signed short _ss;
typedef unsigned short _us;
typedef signed int _sl;
typedef unsigned int _ul;

#define _UC(X)      ((_uc)(unsigned long)(X))
#define _US(X)      ((_us)(unsigned long)(X))
#define _SL(X)      ((_sl)(unsigned long)(X))
#define _UL(X)      ((_ul)(unsigned long)(X))

#define _PUC(X)     ((_uc*)(X))
#define _PUS(X)     ((_us*)(X))
#define _PSL(X)     ((_sl*)(X))
#define _PUL(X)     ((_ul*)(X))

#define _B(B)       x86_emit_byte((B))
#define _W(W)       x86_emit_word((W))
#define _L(L)       x86_emit_long((L))
#define _Q(Q)       x86_emit_quad((Q))

#define _MASK(N)    ((unsigned)((1 << (N))) - 1)
#define _siP(N, I)   (!((((unsigned)(I)) ^ (((unsigned)(I)) << 1)) & ~_MASK (N)))
#define _uiP(N, I)   (!(((unsigned)(I)) & ~_MASK (N)))
#define _suiP(N, I)  (_siP(N, I) | _uiP(N, I))

#ifndef _ASM_SAFETY
    #define _ck_s(W, I)  (_UL(I) & _MASK(W))
    #define _ck_u(W, I)      (_UL(I) & _MASK(W))
    #define _ck_su(W, I)     (_UL(I) & _MASK(W))
    #define _ck_d(W, I)      (_UL(I) & _MASK(W))
#else
    #define _ck_s(W, I)  (_siP(W, I) ? (_UL(I) & _MASK(W)) : x86_emit_failure0("signed integer `" #I "' too large for " #W "-bit field"))
    #define _ck_u(W, I)      (_uiP(W, I) ? (_UL(I) & _MASK(W)) : x86_emit_failure0("unsigned integer `" #I "' too large for " #W "-bit field"))
    #define _ck_su(W, I)     (_suiP(W, I) ? (_UL(I) & _MASK(W)) : x86_emit_failure0("integer `" #I "' too large for " #W "-bit field"))
    #define _ck_d(W, I)      (_siP(W, I) ? (_UL(I) & _MASK(W)) : x86_emit_failure0("displacement `" #I "' too large for " #W "-bit field"))
#endif

#define _s0P(I)     ((I) == 0)
#define _s8P(I)     _siP(8, I)
#define _s16P(I)    _siP(16, I)
#define _u8P(I)     _uiP(8, I)
#define _u16P(I)    _uiP(16, I)

#define _su8(I)     _ck_su(8, I)
#define _su16(I)    _ck_su(16, I)
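
/* For instance, _s8P() accepts exactly the values representable as a signed
 * byte and _u8P() those representable as an unsigned byte:
 *
 *	_s8P(127) != 0,  _s8P(-128) != 0,  _s8P(128) == 0
 *	_u8P(255) != 0,  _u8P(256)  == 0,  _u8P(-1)  == 0
 */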

#define _s1(I)          _ck_s(1, I)
#define _s2(I)          _ck_s(2, I)
#define _s3(I)          _ck_s(3, I)
#define _s4(I)          _ck_s(4, I)
#define _s5(I)          _ck_s(5, I)
#define _s6(I)          _ck_s(6, I)
#define _s7(I)          _ck_s(7, I)
#define _s8(I)          _ck_s(8, I)
#define _s9(I)          _ck_s(9, I)
#define _s10(I)         _ck_s(10, I)
#define _s11(I)         _ck_s(11, I)
#define _s12(I)         _ck_s(12, I)
#define _s13(I)         _ck_s(13, I)
#define _s14(I)         _ck_s(14, I)
#define _s15(I)         _ck_s(15, I)
#define _s16(I)         _ck_s(16, I)
#define _s17(I)         _ck_s(17, I)
#define _s18(I)         _ck_s(18, I)
#define _s19(I)         _ck_s(19, I)
#define _s20(I)         _ck_s(20, I)
#define _s21(I)         _ck_s(21, I)
#define _s22(I)         _ck_s(22, I)
#define _s23(I)         _ck_s(23, I)
#define _s24(I)         _ck_s(24, I)
#define _s25(I)         _ck_s(25, I)
#define _s26(I)         _ck_s(26, I)
#define _s27(I)         _ck_s(27, I)
#define _s28(I)         _ck_s(28, I)
#define _s29(I)         _ck_s(29, I)
#define _s30(I)         _ck_s(30, I)
#define _s31(I)         _ck_s(31, I)
#define _u1(I)          _ck_u(1, I)
#define _u2(I)          _ck_u(2, I)
#define _u3(I)          _ck_u(3, I)
#define _u4(I)          _ck_u(4, I)
#define _u5(I)          _ck_u(5, I)
#define _u6(I)          _ck_u(6, I)
#define _u7(I)          _ck_u(7, I)
#define _u8(I)          _ck_u(8, I)
#define _u9(I)          _ck_u(9, I)
#define _u10(I)         _ck_u(10, I)
#define _u11(I)         _ck_u(11, I)
#define _u12(I)         _ck_u(12, I)
#define _u13(I)         _ck_u(13, I)
#define _u14(I)         _ck_u(14, I)
#define _u15(I)         _ck_u(15, I)
#define _u16(I)         _ck_u(16, I)
#define _u17(I)         _ck_u(17, I)
#define _u18(I)         _ck_u(18, I)
#define _u19(I)         _ck_u(19, I)
#define _u20(I)         _ck_u(20, I)
#define _u21(I)         _ck_u(21, I)
#define _u22(I)         _ck_u(22, I)
#define _u23(I)         _ck_u(23, I)
#define _u24(I)         _ck_u(24, I)
#define _u25(I)         _ck_u(25, I)
#define _u26(I)         _ck_u(26, I)
#define _u27(I)         _ck_u(27, I)
#define _u28(I)         _ck_u(28, I)
#define _u29(I)         _ck_u(29, I)
#define _u30(I)         _ck_u(30, I)
#define _u31(I)         _ck_u(31, I)

/* ========================================================================= */
/* --- ASSEMBLER ----------------------------------------------------------- */
/* ========================================================================= */

#define _b00 0
#define _b01 1
#define _b10 2
#define _b11 3

#define _b000 0
#define _b001 1
#define _b010 2
#define _b011 3
#define _b100 4
#define _b101 5
#define _b110 6
#define _b111 7

#define _OFF4(D)    (_UL(D) - _UL(x86_get_target()))
#define _CKD8(D)    _ck_d(8, ((_uc)_OFF4(D)))

#define _D8(D)      (_B(0), ((*(_PUC(x86_get_target()) - 1)) = _CKD8(D)))
#define _D32(D)     (_L(0), ((*(_PUL(x86_get_target()) - 1)) = _OFF4(D)))

#ifndef _ASM_SAFETY
    #define _M(M)      (M)
    #define _r(R)      (R)
    #define _m(M)      (M)
    #define _s(S)      (S)
    #define _i(I)      (I)
    #define _b(B)      (B)
#else
    #define _M(M)      (((M) > 3) ? x86_emit_failure0("internal error: mod = " #M) : (M))
    #define _r(R)      (((R) > 7) ? x86_emit_failure0("internal error: reg = " #R) : (R))
    #define _m(M)      (((M) > 7) ? x86_emit_failure0("internal error: r/m = " #M) : (M))
    #define _s(S)      (((S) > 3) ? x86_emit_failure0("internal error: memory scale = " #S) : (S))
    #define _i(I)      (((I) > 7) ? x86_emit_failure0("internal error: memory index = " #I) : (I))
    #define _b(B)      (((B) > 7) ? x86_emit_failure0("internal error: memory base = "  #B) : (B))
#endif

#define _Mrm(Md, R, M)    _B((_M(Md) << 6) | (_r(R) << 3) | _m(M))
#define _SIB(Sc, I, B)   _B((_s(Sc) << 6) | (_i(I) << 3) | _b(B))

#define _SCL(S)     ((((S) == 1) ? _b00 : \
                      (((S) == 2) ? _b01 : \
                       (((S) == 4) ? _b10 : \
                        (((S) == 8) ? _b11 : x86_emit_failure0("illegal scale: " #S))))))
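
/* Quick sanity check of the 2-3-3 bit layout shared by both bytes
 * (mod/reg/r-m and scale/index/base, packed from the top):
 *
 *	_Mrm(_b11, _b000, _b011)     emits 0xc3   (mod=11 reg=000 r/m=011)
 *	_SIB(_SCL(4), _b001, _b000)  emits 0x88   (scale=4 index=001 base=000)
 */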


/* --- Memory subformats - urgh! ------------------------------------------- */

/* _r_D() yields RIP-relative addressing when X86_TARGET_64BIT; use _r_DSIB() for a plain 32-bit absolute address */
#define _r_D(R, D) (_Mrm(_b00, _rN(R), _b101), _L((_sl)(D)))
#define _r_DSIB(R, D) (_Mrm(_b00, _rN(R), _b100), _SIB(_SCL(1), _b100, _b101), _L((_sl)(D)))
#define _r_0B(R, B) (_Mrm(_b00, _rN(R), _rA(B)))
#define _r_0BIS(R, B, I, S) (_Mrm(_b00, _rN(R), _b100), _SIB(_SCL(S), _rA(I), _rA(B)))
#define _r_1B(R, D, B) (_Mrm(_b01, _rN(R), _rA(B)), _B((_sc)(D)))
#define _r_1BIS(R, D, B, I, S) (_Mrm(_b01, _rN(R), _b100), _SIB(_SCL(S), _rA(I), _rA(B)), _B((_sc)(D)))
#define _r_4B(R, D, B) (_Mrm(_b10, _rN(R), _rA(B)), _L((_sl)(D)))
#define _r_4IS(R, D, I, S)   (_Mrm(_b00, _rN(R), _b100), _SIB(_SCL(S), _rA(I), _b101), _L((_sl)(D)))
#define _r_4BIS(R, D, B, I, S) (_Mrm(_b10, _rN(R), _b100), _SIB(_SCL(S), _rA(I), _rA(B)), _L((_sl)(D)))

#define _r_DB(R, D, B) ((_s0P(D) && (!_rbp13P(B)) ? _r_0B(R, B) : (_s8P(D) ? _r_1B(R, D, B) : _r_4B(R, D, B))))
#define _r_DBIS(R, D, B, I, S) ((_s0P(D) && (!_rbp13P(B)) ? _r_0BIS(R, B, I, S) : (_s8P(D) ? _r_1BIS(R, D, B, I, S) : _r_4BIS(R, D, B, I, S))))

/* Use RIP-relative addressing in 64-bit mode, if possible */
#define _x86_RIP_addressing_possible(D, O)   (X86_RIP_RELATIVE_ADDR && \
                                              ((uintptr)x86_get_target() + 4 + (O)-(D) <= 0xffffffff))

#define _r_X(R, D, B, I, S, O)   (_r0P(I) ? (_r0P(B)    ? (!X86_TARGET_64BIT ? _r_D(R, D) : \
                                                           (_x86_RIP_addressing_possible(D, O) ? \
                                                            _r_D(R, (D)-((uintptr)x86_get_target() + 4 + (O))) : \
                                                            _r_DSIB(R, D))) : \
                                             (_rIP(B)    ? _r_D(R, D)   : \
                                              (_rsp12P(B) ? _r_DBIS(R, D, _rSP(), _rSP(), 1)   : \
                                               _r_DB(R, D, B))))  : \
                                  (_r0P(B)          ? _r_4IS(R, D, I, S)   : \
                                   (!_rspP(I)            ? _r_DBIS(R, D, B, I, S)   : \
                                    x86_emit_failure("illegal index register: %esp"))))


/* --- Instruction formats ------------------------------------------------- */

#define _m32only(X)     (!X86_TARGET_64BIT ? X : x86_emit_failure("invalid instruction in 64-bit mode"))
#define _m64only(X)     (X86_TARGET_64BIT ? X : x86_emit_failure("invalid instruction in 32-bit mode"))
#define _m64(X)         (X86_TARGET_64BIT ? X : ((void)0))

/*	 _format						     Opcd	  ModR/M dN(rB,rI,Sc)	  imm... */

#define  _d16()                    (_B(0x66))
#define   _O(OP)  (_B(OP))
#define   _Or(OP, R)  (_B((OP) | _r(R)))
#define  _OO(OP)  (_B((OP) >> 8), _B(((OP)) & 0xff))
#define  _OOr(OP, R)  (_B((OP) >> 8), _B(((OP) | _r(R)) & 0xff))
#define   _Os(OP, B)  (_s8P(B) ? _B(((OP) | _b10)) : _B(OP))
#define     _sW(W)  (_s8P(W) ? _B(W) : _W(W))
#define     _sL(L)  (_s8P(L) ? _B(L) : _L(L))
#define     _sWO(W)  (_s8P(W) ?    1 :   2)
#define     _sLO(L)  (_s8P(L) ?    1 :   4)
#define   _O_B(OP, B)  (_O(OP), _B(B))
#define   _O_W(OP, W)  (_O(OP), _W(W))
#define   _O_L(OP, L)  (_O(OP), _L(L))
#define   _OO_L(OP, L)  (_OO(OP), _L(L))
#define   _O_D8(OP, D)  (_O(OP), _D8(D))
#define   _O_D32(OP, D)  (_O(OP), _D32(D))
#define  _OO_D32(OP, D)  (_OO(OP), _D32(D))
#define   _Os_sW(OP, W)  (_Os(OP, W), _sW(W))
#define   _Os_sL(OP, L)  (_Os(OP, L), _sL(L))
#define   _O_W_B(OP, W, B)  (_O(OP), _W(W), _B(B))
#define   _Or_B(OP, R, B)  (_Or(OP, R), _B(B))
#define   _Or_W(OP, R, W)  (_Or(OP, R), _W(W))
#define   _Or_L(OP, R, L)  (_Or(OP, R), _L(L))
#define   _Or_Q(OP, R, Q)  (_Or(OP, R), _Q(Q))
#define   _O_Mrm(OP, MO, R, M)  (_O(OP), _Mrm(MO, R, M))
#define  _OO_Mrm(OP, MO, R, M)  (_OO(OP), _Mrm(MO, R, M))
#define   _O_Mrm_B(OP, MO, R, M, B)  (_O(OP), _Mrm(MO, R, M), _B(B))
#define   _O_Mrm_W(OP, MO, R, M, W)  (_O(OP), _Mrm(MO, R, M), _W(W))
#define   _O_Mrm_L(OP, MO, R, M, L)  (_O(OP), _Mrm(MO, R, M), _L(L))
#define  _OO_Mrm_B(OP, MO, R, M, B)  (_OO(OP), _Mrm(MO, R, M), _B(B))
#define   _Os_Mrm_sW(OP, MO, R, M, W)  (_Os(OP, W), _Mrm(MO, R, M), _sW(W))
#define   _Os_Mrm_sL(OP, MO, R, M, L)  (_Os(OP, L), _Mrm(MO, R, M), _sL(L))
#define   _O_r_X(OP, R, MD, MB, MI, MS)  (_O(OP), _r_X(R, MD, MB, MI, MS, 0))
#define  _OO_r_X(OP, R, MD, MB, MI, MS)  (_OO(OP), _r_X(R, MD, MB, MI, MS, 0))
#define   _O_r_X_B(OP, R, MD, MB, MI, MS, B)  (_O(OP), _r_X(R, MD, MB, MI, MS, 1), _B(B))
#define   _O_r_X_W(OP, R, MD, MB, MI, MS, W)  (_O(OP), _r_X(R, MD, MB, MI, MS, 2), _W(W))
#define   _O_r_X_L(OP, R, MD, MB, MI, MS, L)  (_O(OP), _r_X(R, MD, MB, MI, MS, 4), _L(L))
#define  _OO_r_X_B(OP, R, MD, MB, MI, MS, B)  (_OO(OP), _r_X(R, MD, MB, MI, MS, 1), _B(B))
#define   _Os_r_X_sW(OP, R, MD, MB, MI, MS, W)  (_Os(OP, W), _r_X(R, MD, MB, MI, MS, _sWO(W)), _sW(W))
#define   _Os_r_X_sL(OP, R, MD, MB, MI, MS, L)  (_Os(OP, L), _r_X(R, MD, MB, MI, MS, _sLO(L)), _sL(L))
#define   _O_X_B(OP, MD, MB, MI, MS, B)  (_O_r_X_B(OP, 0, MD, MB, MI, MS, B))
#define   _O_X_W(OP, MD, MB, MI, MS, W)  (_O_r_X_W(OP, 0, MD, MB, MI, MS, W))
#define   _O_X_L(OP, MD, MB, MI, MS, L)  (_O_r_X_L(OP, 0, MD, MB, MI, MS, L))


/* --- REX prefixes -------------------------------------------------------- */

#define _VOID()         ((void)0)
#define _BIT(X)         (!!(X))
#define _d64(W, R, X, B)       (_B(0x40 | (W) << 3 | (R) << 2 | (X) << 1 | (B)))

#define __REXwrxb(L, W, R, X, B)    ((W | R | X | B) || (L) ? _d64(W, R, X, B) : _VOID())
#define __REXwrx_(L, W, R, X, MR)   (__REXwrxb(L, W, R, X, _BIT(_rIP(MR) ? 0 : _rXP(MR))))
#define __REXw_x_(L, W, R, X, MR)   (__REXwrx_(L, W, _BIT(_rXP(R)), X, MR))
#define __REX_reg(RR)       (__REXwrxb(0, 0, 0, 00, _BIT(_rXP(RR))))
#define __REX_mem(MB, MI)    (__REXwrxb(0, 0, 0, _BIT(_rXP(MI)), _BIT(_rXP(MB))))

/* FIXME: can't mix new (SPL,BPL,SIL,DIL) with (AH,BH,CH,DH) */
#define _REXBrr(RR, MR)      _m64(__REXw_x_(_r1e8lP(RR) || _r1e8lP(MR), 0, RR, 0, MR))
#define _REXBmr(MB, MI, RD)   _m64(__REXw_x_(_r1e8lP(RD) || _r1e8lP(MB), 0, RD, _BIT(_rXP(MI)), MB))
#define _REXBrm(RS, MB, MI)   _REXBmr(MB, MI, RS)

#define _REXBLrr(RR, MR)     _m64(__REXw_x_(_r1e8lP(MR), 0, RR, 0, MR))
#define _REXLrr(RR, MR)      _m64(__REXw_x_(0, 0, RR, 0, MR))
#define _REXLmr(MB, MI, RD)   _m64(__REXw_x_(0, 0, RD, _BIT(_rXP(MI)), MB))
#define _REXLrm(RS, MB, MI)   _REXLmr(MB, MI, RS)
#define _REXLr(RR)      _m64(__REX_reg(RR))
#define _REXLm(MB, MI)       _m64(__REX_mem(MB, MI))

#define _REXQrr(RR, MR)      _m64only(__REXw_x_(0, 1, RR, 0, MR))
#define _REXQmr(MB, MI, RD)   _m64only(__REXw_x_(0, 1, RD, _BIT(_rXP(MI)), MB))
#define _REXQrm(RS, MB, MI)   _REXQmr(MB, MI, RS)
#define _REXQr(RR)      _m64only(__REX_reg(RR))
#define _REXQm(MB, MI)       _m64only(__REX_mem(MB, MI))
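
/* Illustration: the REX emitters add the W (64-bit operand size), R (ModRM
 * reg extension), X (SIB index extension) and B (ModRM r/m, SIB base or
 * opcode register extension) bits only when needed, e.g.:
 *
 *	MOVLrr(X86_EAX, X86_EBX)   emits 89 c3      (no REX required)
 *	MOVQrr(X86_RAX, X86_R8)    emits 49 89 c0   movq %rax,%r8
 */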


/* ========================================================================= */
/* --- Fully-qualified intrinsic instructions ------------------------------ */
/* ========================================================================= */

/*	OPCODE	+ i	= immediate operand
 *		+ r	= register operand
 *		+ m	= memory operand (disp,base,index,scale)
 *		+ sr/sm	= a star preceding a register or memory
 *		+ 0	= top of stack register (for FPU instructions)
 *
 *	NOTE in x86-64 mode: a memory operand with only a valid
 *	displacement value will lead to the expected absolute
 *	addressing mode. If RIP-relative addressing is necessary,
 *	X86_RIP shall be used as the base register argument.
 */
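
/* For illustration, a few macro calls and the code they emit (32-bit mode,
 * i.e. X86_TARGET_64BIT == 0, assumed here):
 *
 *	MOVLir(42, X86_EAX)                  emits b8 2a 00 00 00   movl $42,%eax
 *	ADDLrr(X86_EBX, X86_ECX)             emits 01 d9            addl %ebx,%ecx
 *	SUBLim(1, 8, X86_EBP, X86_NOREG, 1)  emits 83 6d 08 01      subl $1,8(%ebp)
 */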

/* --- ALU instructions ---------------------------------------------------- */

enum
{
    X86_ADD = 0,
    X86_OR = 1,
    X86_ADC = 2,
    X86_SBB = 3,
    X86_AND = 4,
    X86_SUB = 5,
    X86_XOR = 6,
    X86_CMP = 7,
};

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define _ALUBrr(OP, RS, RD)      (_REXBrr(RS, RD), _O_Mrm(((OP) << 3), _b11, _r1(RS), _r1(RD)))
#define _ALUBmr(OP, MD, MB, MI, MS, RD) (_REXBmr(MB, MI, RD), _O_r_X(((OP) << 3) + 2, _r1(RD), MD, MB, MI, MS))
#define _ALUBrm(OP, RS, MD, MB, MI, MS) (_REXBrm(RS, MB, MI), _O_r_X(((OP) << 3), _r1(RS), MD, MB, MI, MS))
#define _ALUBir(OP, IM, RD)     (X86_OPTIMIZE_ALU && ((RD) == X86_AL) ? \
                                 (_REXBrr(0, RD), _O_B(((OP) << 3) + 4, _su8(IM))) : \
                                 (_REXBrr(0, RD), _O_Mrm_B(0x80, _b11, OP, _r1(RD), _su8(IM))))
#define _ALUBim(OP, IM, MD, MB, MI, MS) (_REXBrm(0, MB, MI), _O_r_X_B(0x80, OP, MD, MB, MI, MS, _su8(IM)))

#define _ALUWrr(OP, RS, RD)     (_d16(), _REXLrr(RS, RD), _O_Mrm(((OP) << 3) + 1, _b11, _r2(RS), _r2(RD)))
#define _ALUWmr(OP, MD, MB, MI, MS, RD) (_d16(), _REXLmr(MB, MI, RD), _O_r_X(((OP) << 3) + 3, _r2(RD), MD, MB, MI, MS))
#define _ALUWrm(OP, RS, MD, MB, MI, MS) (_d16(), _REXLrm(RS, MB, MI), _O_r_X(((OP) << 3) + 1, _r2(RS), MD, MB, MI, MS))
#define _ALUWir(OP, IM, RD)     (X86_OPTIMIZE_ALU && ((RD) == X86_AX) ? \
                                 (_d16(), _REXLrr(0, RD), _O_W(((OP) << 3) + 5, _su16(IM))) : \
                                 (_d16(), _REXLrr(0, RD), _Os_Mrm_sW(0x81, _b11, OP, _r2(RD), _su16(IM))))
#define _ALUWim(OP, IM, MD, MB, MI, MS) (_d16(), _REXLrm(0, MB, MI), _Os_r_X_sW(0x81, OP, MD, MB, MI, MS, _su16(IM)))

#define _ALULrr(OP, RS, RD)     (_REXLrr(RS, RD), _O_Mrm(((OP) << 3) + 1, _b11, _r4(RS), _r4(RD)))
#define _ALULmr(OP, MD, MB, MI, MS, RD) (_REXLmr(MB, MI, RD), _O_r_X(((OP) << 3) + 3, _r4(RD), MD, MB, MI, MS))
#define _ALULrm(OP, RS, MD, MB, MI, MS) (_REXLrm(RS, MB, MI), _O_r_X(((OP) << 3) + 1, _r4(RS), MD, MB, MI, MS))
#define _ALULir(OP, IM, RD)     (X86_OPTIMIZE_ALU && ((RD) == X86_EAX) ? \
                                 (_REXLrr(0, RD), _O_L(((OP) << 3) + 5, IM)) : \
                                 (_REXLrr(0, RD), _Os_Mrm_sL(0x81, _b11, OP, _r4(RD), IM)))
#define _ALULim(OP, IM, MD, MB, MI, MS) (_REXLrm(0, MB, MI), _Os_r_X_sL(0x81, OP, MD, MB, MI, MS, IM))

#define _ALUQrr(OP, RS, RD)     (_REXQrr(RS, RD), _O_Mrm(((OP) << 3) + 1, _b11, _r8(RS), _r8(RD)))
#define _ALUQmr(OP, MD, MB, MI, MS, RD) (_REXQmr(MB, MI, RD), _O_r_X(((OP) << 3) + 3, _r8(RD), MD, MB, MI, MS))
#define _ALUQrm(OP, RS, MD, MB, MI, MS) (_REXQrm(RS, MB, MI), _O_r_X(((OP) << 3) + 1, _r8(RS), MD, MB, MI, MS))
#define _ALUQir(OP, IM, RD)     (X86_OPTIMIZE_ALU && ((RD) == X86_RAX) ? \
                                 (_REXQrr(0, RD), _O_L(((OP) << 3) + 5, IM)) : \
                                 (_REXQrr(0, RD), _Os_Mrm_sL(0x81, _b11, OP, _r8(RD), IM)))
#define _ALUQim(OP, IM, MD, MB, MI, MS) (_REXQrm(0, MB, MI), _Os_r_X_sL(0x81, OP, MD, MB, MI, MS, IM))
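
/* With X86_OPTIMIZE_ALU, an immediate whose destination is the accumulator
 * uses the short accumulator encodings; other destinations go through the
 * 0x81/0x83 group.  For instance (32-bit mode assumed):
 *
 *	ADDLir(1, X86_EAX)   emits 05 01 00 00 00   addl $1,%eax
 *	ADDLir(1, X86_ECX)   emits 83 c1 01         addl $1,%ecx
 */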

#define ADCBrr(RS, RD)          _ALUBrr(X86_ADC, RS, RD)
#define ADCBmr(MD, MB, MI, MS, RD)  _ALUBmr(X86_ADC, MD, MB, MI, MS, RD)
#define ADCBrm(RS, MD, MB, MI, MS)  _ALUBrm(X86_ADC, RS, MD, MB, MI, MS)
#define ADCBir(IM, RD)          _ALUBir(X86_ADC, IM, RD)
#define ADCBim(IM, MD, MB, MI, MS)  _ALUBim(X86_ADC, IM, MD, MB, MI, MS)

#define ADCWrr(RS, RD)          _ALUWrr(X86_ADC, RS, RD)
#define ADCWmr(MD, MB, MI, MS, RD)  _ALUWmr(X86_ADC, MD, MB, MI, MS, RD)
#define ADCWrm(RS, MD, MB, MI, MS)  _ALUWrm(X86_ADC, RS, MD, MB, MI, MS)
#define ADCWir(IM, RD)          _ALUWir(X86_ADC, IM, RD)
#define ADCWim(IM, MD, MB, MI, MS)  _ALUWim(X86_ADC, IM, MD, MB, MI, MS)

#define ADCLrr(RS, RD)          _ALULrr(X86_ADC, RS, RD)
#define ADCLmr(MD, MB, MI, MS, RD)  _ALULmr(X86_ADC, MD, MB, MI, MS, RD)
#define ADCLrm(RS, MD, MB, MI, MS)  _ALULrm(X86_ADC, RS, MD, MB, MI, MS)
#define ADCLir(IM, RD)          _ALULir(X86_ADC, IM, RD)
#define ADCLim(IM, MD, MB, MI, MS)  _ALULim(X86_ADC, IM, MD, MB, MI, MS)

#define ADCQrr(RS, RD)          _ALUQrr(X86_ADC, RS, RD)
#define ADCQmr(MD, MB, MI, MS, RD)  _ALUQmr(X86_ADC, MD, MB, MI, MS, RD)
#define ADCQrm(RS, MD, MB, MI, MS)  _ALUQrm(X86_ADC, RS, MD, MB, MI, MS)
#define ADCQir(IM, RD)          _ALUQir(X86_ADC, IM, RD)
#define ADCQim(IM, MD, MB, MI, MS)  _ALUQim(X86_ADC, IM, MD, MB, MI, MS)

#define ADDBrr(RS, RD)          _ALUBrr(X86_ADD, RS, RD)
#define ADDBmr(MD, MB, MI, MS, RD)  _ALUBmr(X86_ADD, MD, MB, MI, MS, RD)
#define ADDBrm(RS, MD, MB, MI, MS)  _ALUBrm(X86_ADD, RS, MD, MB, MI, MS)
#define ADDBir(IM, RD)          _ALUBir(X86_ADD, IM, RD)
#define ADDBim(IM, MD, MB, MI, MS)  _ALUBim(X86_ADD, IM, MD, MB, MI, MS)

#define ADDWrr(RS, RD)          _ALUWrr(X86_ADD, RS, RD)
#define ADDWmr(MD, MB, MI, MS, RD)  _ALUWmr(X86_ADD, MD, MB, MI, MS, RD)
#define ADDWrm(RS, MD, MB, MI, MS)  _ALUWrm(X86_ADD, RS, MD, MB, MI, MS)
#define ADDWir(IM, RD)          _ALUWir(X86_ADD, IM, RD)
#define ADDWim(IM, MD, MB, MI, MS)  _ALUWim(X86_ADD, IM, MD, MB, MI, MS)

#define ADDLrr(RS, RD)          _ALULrr(X86_ADD, RS, RD)
#define ADDLmr(MD, MB, MI, MS, RD)  _ALULmr(X86_ADD, MD, MB, MI, MS, RD)
#define ADDLrm(RS, MD, MB, MI, MS)  _ALULrm(X86_ADD, RS, MD, MB, MI, MS)
#define ADDLir(IM, RD)          _ALULir(X86_ADD, IM, RD)
#define ADDLim(IM, MD, MB, MI, MS)  _ALULim(X86_ADD, IM, MD, MB, MI, MS)

#define ADDQrr(RS, RD)          _ALUQrr(X86_ADD, RS, RD)
#define ADDQmr(MD, MB, MI, MS, RD)  _ALUQmr(X86_ADD, MD, MB, MI, MS, RD)
#define ADDQrm(RS, MD, MB, MI, MS)  _ALUQrm(X86_ADD, RS, MD, MB, MI, MS)
#define ADDQir(IM, RD)          _ALUQir(X86_ADD, IM, RD)
#define ADDQim(IM, MD, MB, MI, MS)  _ALUQim(X86_ADD, IM, MD, MB, MI, MS)

#define ANDBrr(RS, RD)          _ALUBrr(X86_AND, RS, RD)
#define ANDBmr(MD, MB, MI, MS, RD)  _ALUBmr(X86_AND, MD, MB, MI, MS, RD)
#define ANDBrm(RS, MD, MB, MI, MS)  _ALUBrm(X86_AND, RS, MD, MB, MI, MS)
#define ANDBir(IM, RD)          _ALUBir(X86_AND, IM, RD)
#define ANDBim(IM, MD, MB, MI, MS)  _ALUBim(X86_AND, IM, MD, MB, MI, MS)

#define ANDWrr(RS, RD)          _ALUWrr(X86_AND, RS, RD)
#define ANDWmr(MD, MB, MI, MS, RD)  _ALUWmr(X86_AND, MD, MB, MI, MS, RD)
#define ANDWrm(RS, MD, MB, MI, MS)  _ALUWrm(X86_AND, RS, MD, MB, MI, MS)
#define ANDWir(IM, RD)          _ALUWir(X86_AND, IM, RD)
#define ANDWim(IM, MD, MB, MI, MS)  _ALUWim(X86_AND, IM, MD, MB, MI, MS)

#define ANDLrr(RS, RD)          _ALULrr(X86_AND, RS, RD)
#define ANDLmr(MD, MB, MI, MS, RD)  _ALULmr(X86_AND, MD, MB, MI, MS, RD)
#define ANDLrm(RS, MD, MB, MI, MS)  _ALULrm(X86_AND, RS, MD, MB, MI, MS)
#define ANDLir(IM, RD)          _ALULir(X86_AND, IM, RD)
#define ANDLim(IM, MD, MB, MI, MS)  _ALULim(X86_AND, IM, MD, MB, MI, MS)

#define ANDQrr(RS, RD)          _ALUQrr(X86_AND, RS, RD)
#define ANDQmr(MD, MB, MI, MS, RD)  _ALUQmr(X86_AND, MD, MB, MI, MS, RD)
#define ANDQrm(RS, MD, MB, MI, MS)  _ALUQrm(X86_AND, RS, MD, MB, MI, MS)
#define ANDQir(IM, RD)          _ALUQir(X86_AND, IM, RD)
#define ANDQim(IM, MD, MB, MI, MS)  _ALUQim(X86_AND, IM, MD, MB, MI, MS)

#define CMPBrr(RS, RD)          _ALUBrr(X86_CMP, RS, RD)
#define CMPBmr(MD, MB, MI, MS, RD)  _ALUBmr(X86_CMP, MD, MB, MI, MS, RD)
#define CMPBrm(RS, MD, MB, MI, MS)  _ALUBrm(X86_CMP, RS, MD, MB, MI, MS)
#define CMPBir(IM, RD)          _ALUBir(X86_CMP, IM, RD)
#define CMPBim(IM, MD, MB, MI, MS)  _ALUBim(X86_CMP, IM, MD, MB, MI, MS)

#define CMPWrr(RS, RD)          _ALUWrr(X86_CMP, RS, RD)
#define CMPWmr(MD, MB, MI, MS, RD)  _ALUWmr(X86_CMP, MD, MB, MI, MS, RD)
#define CMPWrm(RS, MD, MB, MI, MS)  _ALUWrm(X86_CMP, RS, MD, MB, MI, MS)
#define CMPWir(IM, RD)          _ALUWir(X86_CMP, IM, RD)
#define CMPWim(IM, MD, MB, MI, MS)  _ALUWim(X86_CMP, IM, MD, MB, MI, MS)

#define CMPLrr(RS, RD)          _ALULrr(X86_CMP, RS, RD)
#define CMPLmr(MD, MB, MI, MS, RD)  _ALULmr(X86_CMP, MD, MB, MI, MS, RD)
#define CMPLrm(RS, MD, MB, MI, MS)  _ALULrm(X86_CMP, RS, MD, MB, MI, MS)
#define CMPLir(IM, RD)          _ALULir(X86_CMP, IM, RD)
#define CMPLim(IM, MD, MB, MI, MS)  _ALULim(X86_CMP, IM, MD, MB, MI, MS)

#define CMPQrr(RS, RD)          _ALUQrr(X86_CMP, RS, RD)
#define CMPQmr(MD, MB, MI, MS, RD)  _ALUQmr(X86_CMP, MD, MB, MI, MS, RD)
#define CMPQrm(RS, MD, MB, MI, MS)  _ALUQrm(X86_CMP, RS, MD, MB, MI, MS)
#define CMPQir(IM, RD)          _ALUQir(X86_CMP, IM, RD)
#define CMPQim(IM, MD, MB, MI, MS)  _ALUQim(X86_CMP, IM, MD, MB, MI, MS)

#define ORBrr(RS, RD)           _ALUBrr(X86_OR, RS, RD)
#define ORBmr(MD, MB, MI, MS, RD)   _ALUBmr(X86_OR, MD, MB, MI, MS, RD)
#define ORBrm(RS, MD, MB, MI, MS)   _ALUBrm(X86_OR, RS, MD, MB, MI, MS)
#define ORBir(IM, RD)           _ALUBir(X86_OR, IM, RD)
#define ORBim(IM, MD, MB, MI, MS)   _ALUBim(X86_OR, IM, MD, MB, MI, MS)

#define ORWrr(RS, RD)           _ALUWrr(X86_OR, RS, RD)
#define ORWmr(MD, MB, MI, MS, RD)   _ALUWmr(X86_OR, MD, MB, MI, MS, RD)
#define ORWrm(RS, MD, MB, MI, MS)   _ALUWrm(X86_OR, RS, MD, MB, MI, MS)
#define ORWir(IM, RD)           _ALUWir(X86_OR, IM, RD)
#define ORWim(IM, MD, MB, MI, MS)   _ALUWim(X86_OR, IM, MD, MB, MI, MS)

#define ORLrr(RS, RD)           _ALULrr(X86_OR, RS, RD)
#define ORLmr(MD, MB, MI, MS, RD)   _ALULmr(X86_OR, MD, MB, MI, MS, RD)
#define ORLrm(RS, MD, MB, MI, MS)   _ALULrm(X86_OR, RS, MD, MB, MI, MS)
#define ORLir(IM, RD)           _ALULir(X86_OR, IM, RD)
#define ORLim(IM, MD, MB, MI, MS)   _ALULim(X86_OR, IM, MD, MB, MI, MS)

#define ORQrr(RS, RD)           _ALUQrr(X86_OR, RS, RD)
#define ORQmr(MD, MB, MI, MS, RD)   _ALUQmr(X86_OR, MD, MB, MI, MS, RD)
#define ORQrm(RS, MD, MB, MI, MS)   _ALUQrm(X86_OR, RS, MD, MB, MI, MS)
#define ORQir(IM, RD)           _ALUQir(X86_OR, IM, RD)
#define ORQim(IM, MD, MB, MI, MS)   _ALUQim(X86_OR, IM, MD, MB, MI, MS)

#define SBBBrr(RS, RD)          _ALUBrr(X86_SBB, RS, RD)
#define SBBBmr(MD, MB, MI, MS, RD)  _ALUBmr(X86_SBB, MD, MB, MI, MS, RD)
#define SBBBrm(RS, MD, MB, MI, MS)  _ALUBrm(X86_SBB, RS, MD, MB, MI, MS)
#define SBBBir(IM, RD)          _ALUBir(X86_SBB, IM, RD)
#define SBBBim(IM, MD, MB, MI, MS)  _ALUBim(X86_SBB, IM, MD, MB, MI, MS)

#define SBBWrr(RS, RD)          _ALUWrr(X86_SBB, RS, RD)
#define SBBWmr(MD, MB, MI, MS, RD)  _ALUWmr(X86_SBB, MD, MB, MI, MS, RD)
#define SBBWrm(RS, MD, MB, MI, MS)  _ALUWrm(X86_SBB, RS, MD, MB, MI, MS)
#define SBBWir(IM, RD)          _ALUWir(X86_SBB, IM, RD)
#define SBBWim(IM, MD, MB, MI, MS)  _ALUWim(X86_SBB, IM, MD, MB, MI, MS)

#define SBBLrr(RS, RD)          _ALULrr(X86_SBB, RS, RD)
#define SBBLmr(MD, MB, MI, MS, RD)  _ALULmr(X86_SBB, MD, MB, MI, MS, RD)
#define SBBLrm(RS, MD, MB, MI, MS)  _ALULrm(X86_SBB, RS, MD, MB, MI, MS)
#define SBBLir(IM, RD)          _ALULir(X86_SBB, IM, RD)
#define SBBLim(IM, MD, MB, MI, MS)  _ALULim(X86_SBB, IM, MD, MB, MI, MS)

#define SBBQrr(RS, RD)          _ALUQrr(X86_SBB, RS, RD)
#define SBBQmr(MD, MB, MI, MS, RD)  _ALUQmr(X86_SBB, MD, MB, MI, MS, RD)
#define SBBQrm(RS, MD, MB, MI, MS)  _ALUQrm(X86_SBB, RS, MD, MB, MI, MS)
#define SBBQir(IM, RD)          _ALUQir(X86_SBB, IM, RD)
#define SBBQim(IM, MD, MB, MI, MS)  _ALUQim(X86_SBB, IM, MD, MB, MI, MS)

#define SUBBrr(RS, RD)          _ALUBrr(X86_SUB, RS, RD)
#define SUBBmr(MD, MB, MI, MS, RD)  _ALUBmr(X86_SUB, MD, MB, MI, MS, RD)
#define SUBBrm(RS, MD, MB, MI, MS)  _ALUBrm(X86_SUB, RS, MD, MB, MI, MS)
#define SUBBir(IM, RD)          _ALUBir(X86_SUB, IM, RD)
#define SUBBim(IM, MD, MB, MI, MS)  _ALUBim(X86_SUB, IM, MD, MB, MI, MS)

#define SUBWrr(RS, RD)          _ALUWrr(X86_SUB, RS, RD)
#define SUBWmr(MD, MB, MI, MS, RD)  _ALUWmr(X86_SUB, MD, MB, MI, MS, RD)
#define SUBWrm(RS, MD, MB, MI, MS)  _ALUWrm(X86_SUB, RS, MD, MB, MI, MS)
#define SUBWir(IM, RD)          _ALUWir(X86_SUB, IM, RD)
#define SUBWim(IM, MD, MB, MI, MS)  _ALUWim(X86_SUB, IM, MD, MB, MI, MS)

#define SUBLrr(RS, RD)          _ALULrr(X86_SUB, RS, RD)
#define SUBLmr(MD, MB, MI, MS, RD)  _ALULmr(X86_SUB, MD, MB, MI, MS, RD)
#define SUBLrm(RS, MD, MB, MI, MS)  _ALULrm(X86_SUB, RS, MD, MB, MI, MS)
#define SUBLir(IM, RD)          _ALULir(X86_SUB, IM, RD)
#define SUBLim(IM, MD, MB, MI, MS)  _ALULim(X86_SUB, IM, MD, MB, MI, MS)

#define SUBQrr(RS, RD)          _ALUQrr(X86_SUB, RS, RD)
#define SUBQmr(MD, MB, MI, MS, RD)  _ALUQmr(X86_SUB, MD, MB, MI, MS, RD)
#define SUBQrm(RS, MD, MB, MI, MS)  _ALUQrm(X86_SUB, RS, MD, MB, MI, MS)
#define SUBQir(IM, RD)          _ALUQir(X86_SUB, IM, RD)
#define SUBQim(IM, MD, MB, MI, MS)  _ALUQim(X86_SUB, IM, MD, MB, MI, MS)

#define XORBrr(RS, RD)          _ALUBrr(X86_XOR, RS, RD)
#define XORBmr(MD, MB, MI, MS, RD)  _ALUBmr(X86_XOR, MD, MB, MI, MS, RD)
#define XORBrm(RS, MD, MB, MI, MS)  _ALUBrm(X86_XOR, RS, MD, MB, MI, MS)
#define XORBir(IM, RD)          _ALUBir(X86_XOR, IM, RD)
#define XORBim(IM, MD, MB, MI, MS)  _ALUBim(X86_XOR, IM, MD, MB, MI, MS)

#define XORWrr(RS, RD)          _ALUWrr(X86_XOR, RS, RD)
#define XORWmr(MD, MB, MI, MS, RD)  _ALUWmr(X86_XOR, MD, MB, MI, MS, RD)
#define XORWrm(RS, MD, MB, MI, MS)  _ALUWrm(X86_XOR, RS, MD, MB, MI, MS)
#define XORWir(IM, RD)          _ALUWir(X86_XOR, IM, RD)
#define XORWim(IM, MD, MB, MI, MS)  _ALUWim(X86_XOR, IM, MD, MB, MI, MS)

#define XORLrr(RS, RD)          _ALULrr(X86_XOR, RS, RD)
#define XORLmr(MD, MB, MI, MS, RD)  _ALULmr(X86_XOR, MD, MB, MI, MS, RD)
#define XORLrm(RS, MD, MB, MI, MS)  _ALULrm(X86_XOR, RS, MD, MB, MI, MS)
#define XORLir(IM, RD)          _ALULir(X86_XOR, IM, RD)
#define XORLim(IM, MD, MB, MI, MS)  _ALULim(X86_XOR, IM, MD, MB, MI, MS)

#define XORQrr(RS, RD)          _ALUQrr(X86_XOR, RS, RD)
#define XORQmr(MD, MB, MI, MS, RD)  _ALUQmr(X86_XOR, MD, MB, MI, MS, RD)
#define XORQrm(RS, MD, MB, MI, MS)  _ALUQrm(X86_XOR, RS, MD, MB, MI, MS)
#define XORQir(IM, RD)          _ALUQir(X86_XOR, IM, RD)
#define XORQim(IM, MD, MB, MI, MS)  _ALUQim(X86_XOR, IM, MD, MB, MI, MS)


/* --- Shift/Rotate instructions ------------------------------------------- */

enum
{
    X86_ROL = 0,
    X86_ROR = 1,
    X86_RCL = 2,
    X86_RCR = 3,
    X86_SHL = 4,
    X86_SHR = 5,
    X86_SAR = 7,
};

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define _ROTSHIBir(OP, IM, RD)        (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                       (_REXBrr(0, RD), _O_Mrm(0xd0, _b11, OP, _r1(RD))) : \
                                       (_REXBrr(0, RD), _O_Mrm_B(0xc0, _b11, OP, _r1(RD), _u8(IM))))
#define _ROTSHIBim(OP, IM, MD, MB, MI, MS)   (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                              (_REXBrm(0, MB, MI), _O_r_X(0xd0, OP, MD, MB, MI, MS)) : \
                                              (_REXBrm(0, MB, MI), _O_r_X_B(0xc0, OP, MD, MB, MI, MS, _u8(IM))))
#define _ROTSHIBrr(OP, RS, RD)        (((RS) == X86_CL) ? \
                                       (_REXBrr(RS, RD), _O_Mrm(0xd2, _b11, OP, _r1(RD))) : \
                                       x86_emit_failure("source register must be CL"))
#define _ROTSHIBrm(OP, RS, MD, MB, MI, MS)   (((RS) == X86_CL) ? \
                                              (_REXBrm(RS, MB, MI), _O_r_X(0xd2, OP, MD, MB, MI, MS)) : \
                                              x86_emit_failure("source register must be CL"))

#define _ROTSHIWir(OP, IM, RD)        (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                       (_d16(), _REXLrr(0, RD), _O_Mrm(0xd1, _b11, OP, _r2(RD))) : \
                                       (_d16(), _REXLrr(0, RD), _O_Mrm_B(0xc1, _b11, OP, _r2(RD), _u8(IM))))
#define _ROTSHIWim(OP, IM, MD, MB, MI, MS)   (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                              (_d16(), _REXLrm(0, MB, MI), _O_r_X(0xd1, OP, MD, MB, MI, MS)) : \
                                              (_d16(), _REXLrm(0, MB, MI), _O_r_X_B(0xc1, OP, MD, MB, MI, MS, _u8(IM))))
#define _ROTSHIWrr(OP, RS, RD)        (((RS) == X86_CL) ? \
                                       (_d16(), _REXLrr(RS, RD), _O_Mrm(0xd3, _b11, OP, _r2(RD))) : \
                                       x86_emit_failure("source register must be CL"))
#define _ROTSHIWrm(OP, RS, MD, MB, MI, MS)   (((RS) == X86_CL) ? \
                                              (_d16(), _REXLrm(RS, MB, MI), _O_r_X(0xd3, OP, MD, MB, MI, MS)) : \
                                              x86_emit_failure("source register must be CL"))

#define _ROTSHILir(OP, IM, RD)        (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                       (_REXLrr(0, RD), _O_Mrm(0xd1, _b11, OP, _r4(RD))) : \
                                       (_REXLrr(0, RD), _O_Mrm_B(0xc1, _b11, OP, _r4(RD), _u8(IM))))
#define _ROTSHILim(OP, IM, MD, MB, MI, MS)   (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                              (_REXLrm(0, MB, MI), _O_r_X(0xd1, OP, MD, MB, MI, MS)) : \
                                              (_REXLrm(0, MB, MI), _O_r_X_B(0xc1, OP, MD, MB, MI, MS, _u8(IM))))
#define _ROTSHILrr(OP, RS, RD)        (((RS) == X86_CL) ? \
                                       (_REXLrr(RS, RD), _O_Mrm(0xd3, _b11, OP, _r4(RD))) : \
                                       x86_emit_failure("source register must be CL"))
#define _ROTSHILrm(OP, RS, MD, MB, MI, MS)   (((RS) == X86_CL) ? \
                                              (_REXLrm(RS, MB, MI), _O_r_X(0xd3, OP, MD, MB, MI, MS)) : \
                                              x86_emit_failure("source register must be CL"))

#define _ROTSHIQir(OP, IM, RD)        (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                       (_REXQrr(0, RD), _O_Mrm(0xd1, _b11, OP, _r8(RD))) : \
                                       (_REXQrr(0, RD), _O_Mrm_B(0xc1, _b11, OP, _r8(RD), _u8(IM))))
#define _ROTSHIQim(OP, IM, MD, MB, MI, MS)   (X86_OPTIMIZE_ROTSHI && ((IM) == 1) ? \
                                              (_REXQrm(0, MB, MI), _O_r_X(0xd1, OP, MD, MB, MI, MS)) : \
                                              (_REXQrm(0, MB, MI), _O_r_X_B(0xc1, OP, MD, MB, MI, MS, _u8(IM))))
#define _ROTSHIQrr(OP, RS, RD)        (((RS) == X86_CL) ? \
                                       (_REXQrr(RS, RD), _O_Mrm(0xd3, _b11, OP, _r8(RD))) : \
                                       x86_emit_failure("source register must be CL"))
#define _ROTSHIQrm(OP, RS, MD, MB, MI, MS)   (((RS) == X86_CL) ? \
                                              (_REXQrm(RS, MB, MI), _O_r_X(0xd3, OP, MD, MB, MI, MS)) : \
                                              x86_emit_failure("source register must be CL"))

#define ROLBir(IM, RD)          _ROTSHIBir(X86_ROL, IM, RD)
#define ROLBim(IM, MD, MB, MI, MS)  _ROTSHIBim(X86_ROL, IM, MD, MB, MI, MS)
#define ROLBrr(RS, RD)          _ROTSHIBrr(X86_ROL, RS, RD)
#define ROLBrm(RS, MD, MB, MI, MS)  _ROTSHIBrm(X86_ROL, RS, MD, MB, MI, MS)

#define ROLWir(IM, RD)          _ROTSHIWir(X86_ROL, IM, RD)
#define ROLWim(IM, MD, MB, MI, MS)  _ROTSHIWim(X86_ROL, IM, MD, MB, MI, MS)
#define ROLWrr(RS, RD)          _ROTSHIWrr(X86_ROL, RS, RD)
#define ROLWrm(RS, MD, MB, MI, MS)  _ROTSHIWrm(X86_ROL, RS, MD, MB, MI, MS)

#define ROLLir(IM, RD)          _ROTSHILir(X86_ROL, IM, RD)
#define ROLLim(IM, MD, MB, MI, MS)  _ROTSHILim(X86_ROL, IM, MD, MB, MI, MS)
#define ROLLrr(RS, RD)          _ROTSHILrr(X86_ROL, RS, RD)
#define ROLLrm(RS, MD, MB, MI, MS)  _ROTSHILrm(X86_ROL, RS, MD, MB, MI, MS)

#define ROLQir(IM, RD)          _ROTSHIQir(X86_ROL, IM, RD)
#define ROLQim(IM, MD, MB, MI, MS)  _ROTSHIQim(X86_ROL, IM, MD, MB, MI, MS)
#define ROLQrr(RS, RD)          _ROTSHIQrr(X86_ROL, RS, RD)
#define ROLQrm(RS, MD, MB, MI, MS)  _ROTSHIQrm(X86_ROL, RS, MD, MB, MI, MS)

#define RORBir(IM, RD)          _ROTSHIBir(X86_ROR, IM, RD)
#define RORBim(IM, MD, MB, MI, MS)  _ROTSHIBim(X86_ROR, IM, MD, MB, MI, MS)
#define RORBrr(RS, RD)          _ROTSHIBrr(X86_ROR, RS, RD)
#define RORBrm(RS, MD, MB, MI, MS)  _ROTSHIBrm(X86_ROR, RS, MD, MB, MI, MS)

#define RORWir(IM, RD)          _ROTSHIWir(X86_ROR, IM, RD)
#define RORWim(IM, MD, MB, MI, MS)  _ROTSHIWim(X86_ROR, IM, MD, MB, MI, MS)
#define RORWrr(RS, RD)          _ROTSHIWrr(X86_ROR, RS, RD)
#define RORWrm(RS, MD, MB, MI, MS)  _ROTSHIWrm(X86_ROR, RS, MD, MB, MI, MS)

#define RORLir(IM, RD)          _ROTSHILir(X86_ROR, IM, RD)
#define RORLim(IM, MD, MB, MI, MS)  _ROTSHILim(X86_ROR, IM, MD, MB, MI, MS)
#define RORLrr(RS, RD)          _ROTSHILrr(X86_ROR, RS, RD)
#define RORLrm(RS, MD, MB, MI, MS)  _ROTSHILrm(X86_ROR, RS, MD, MB, MI, MS)

#define RORQir(IM, RD)          _ROTSHIQir(X86_ROR, IM, RD)
#define RORQim(IM, MD, MB, MI, MS)  _ROTSHIQim(X86_ROR, IM, MD, MB, MI, MS)
#define RORQrr(RS, RD)          _ROTSHIQrr(X86_ROR, RS, RD)
#define RORQrm(RS, MD, MB, MI, MS)  _ROTSHIQrm(X86_ROR, RS, MD, MB, MI, MS)

#define RCLBir(IM, RD)          _ROTSHIBir(X86_RCL, IM, RD)
#define RCLBim(IM, MD, MB, MI, MS)  _ROTSHIBim(X86_RCL, IM, MD, MB, MI, MS)
#define RCLBrr(RS, RD)          _ROTSHIBrr(X86_RCL, RS, RD)
#define RCLBrm(RS, MD, MB, MI, MS)  _ROTSHIBrm(X86_RCL, RS, MD, MB, MI, MS)

#define RCLWir(IM, RD)          _ROTSHIWir(X86_RCL, IM, RD)
#define RCLWim(IM, MD, MB, MI, MS)  _ROTSHIWim(X86_RCL, IM, MD, MB, MI, MS)
#define RCLWrr(RS, RD)          _ROTSHIWrr(X86_RCL, RS, RD)
#define RCLWrm(RS, MD, MB, MI, MS)  _ROTSHIWrm(X86_RCL, RS, MD, MB, MI, MS)

#define RCLLir(IM, RD)          _ROTSHILir(X86_RCL, IM, RD)
#define RCLLim(IM, MD, MB, MI, MS)  _ROTSHILim(X86_RCL, IM, MD, MB, MI, MS)
#define RCLLrr(RS, RD)          _ROTSHILrr(X86_RCL, RS, RD)
#define RCLLrm(RS, MD, MB, MI, MS)  _ROTSHILrm(X86_RCL, RS, MD, MB, MI, MS)

#define RCLQir(IM, RD)          _ROTSHIQir(X86_RCL, IM, RD)
#define RCLQim(IM, MD, MB, MI, MS)  _ROTSHIQim(X86_RCL, IM, MD, MB, MI, MS)
#define RCLQrr(RS, RD)          _ROTSHIQrr(X86_RCL, RS, RD)
#define RCLQrm(RS, MD, MB, MI, MS)  _ROTSHIQrm(X86_RCL, RS, MD, MB, MI, MS)

#define RCRBir(IM, RD)          _ROTSHIBir(X86_RCR, IM, RD)
#define RCRBim(IM, MD, MB, MI, MS)  _ROTSHIBim(X86_RCR, IM, MD, MB, MI, MS)
#define RCRBrr(RS, RD)          _ROTSHIBrr(X86_RCR, RS, RD)
#define RCRBrm(RS, MD, MB, MI, MS)  _ROTSHIBrm(X86_RCR, RS, MD, MB, MI, MS)

#define RCRWir(IM, RD)          _ROTSHIWir(X86_RCR, IM, RD)
#define RCRWim(IM, MD, MB, MI, MS)  _ROTSHIWim(X86_RCR, IM, MD, MB, MI, MS)
#define RCRWrr(RS, RD)          _ROTSHIWrr(X86_RCR, RS, RD)
#define RCRWrm(RS, MD, MB, MI, MS)  _ROTSHIWrm(X86_RCR, RS, MD, MB, MI, MS)

#define RCRLir(IM, RD)          _ROTSHILir(X86_RCR, IM, RD)
#define RCRLim(IM, MD, MB, MI, MS)  _ROTSHILim(X86_RCR, IM, MD, MB, MI, MS)
#define RCRLrr(RS, RD)          _ROTSHILrr(X86_RCR, RS, RD)
#define RCRLrm(RS, MD, MB, MI, MS)  _ROTSHILrm(X86_RCR, RS, MD, MB, MI, MS)

#define RCRQir(IM, RD)          _ROTSHIQir(X86_RCR, IM, RD)
#define RCRQim(IM, MD, MB, MI, MS)  _ROTSHIQim(X86_RCR, IM, MD, MB, MI, MS)
#define RCRQrr(RS, RD)          _ROTSHIQrr(X86_RCR, RS, RD)
#define RCRQrm(RS, MD, MB, MI, MS)  _ROTSHIQrm(X86_RCR, RS, MD, MB, MI, MS)

#define SHLBir(IM, RD)          _ROTSHIBir(X86_SHL, IM, RD)
#define SHLBim(IM, MD, MB, MI, MS)  _ROTSHIBim(X86_SHL, IM, MD, MB, MI, MS)
#define SHLBrr(RS, RD)          _ROTSHIBrr(X86_SHL, RS, RD)
#define SHLBrm(RS, MD, MB, MI, MS)  _ROTSHIBrm(X86_SHL, RS, MD, MB, MI, MS)

#define SHLWir(IM, RD)          _ROTSHIWir(X86_SHL, IM, RD)
#define SHLWim(IM, MD, MB, MI, MS)  _ROTSHIWim(X86_SHL, IM, MD, MB, MI, MS)
#define SHLWrr(RS, RD)          _ROTSHIWrr(X86_SHL, RS, RD)
#define SHLWrm(RS, MD, MB, MI, MS)  _ROTSHIWrm(X86_SHL, RS, MD, MB, MI, MS)

#define SHLLir(IM, RD)          _ROTSHILir(X86_SHL, IM, RD)
#define SHLLim(IM, MD, MB, MI, MS)  _ROTSHILim(X86_SHL, IM, MD, MB, MI, MS)
#define SHLLrr(RS, RD)          _ROTSHILrr(X86_SHL, RS, RD)
#define SHLLrm(RS, MD, MB, MI, MS)  _ROTSHILrm(X86_SHL, RS, MD, MB, MI, MS)

#define SHLQir(IM, RD)          _ROTSHIQir(X86_SHL, IM, RD)
#define SHLQim(IM, MD, MB, MI, MS)  _ROTSHIQim(X86_SHL, IM, MD, MB, MI, MS)
#define SHLQrr(RS, RD)          _ROTSHIQrr(X86_SHL, RS, RD)
#define SHLQrm(RS, MD, MB, MI, MS)  _ROTSHIQrm(X86_SHL, RS, MD, MB, MI, MS)

#define SHRBir(IM, RD)          _ROTSHIBir(X86_SHR, IM, RD)
#define SHRBim(IM, MD, MB, MI, MS)  _ROTSHIBim(X86_SHR, IM, MD, MB, MI, MS)
#define SHRBrr(RS, RD)          _ROTSHIBrr(X86_SHR, RS, RD)
#define SHRBrm(RS, MD, MB, MI, MS)  _ROTSHIBrm(X86_SHR, RS, MD, MB, MI, MS)

#define SHRWir(IM, RD)          _ROTSHIWir(X86_SHR, IM, RD)
#define SHRWim(IM, MD, MB, MI, MS)  _ROTSHIWim(X86_SHR, IM, MD, MB, MI, MS)
#define SHRWrr(RS, RD)          _ROTSHIWrr(X86_SHR, RS, RD)
#define SHRWrm(RS, MD, MB, MI, MS)  _ROTSHIWrm(X86_SHR, RS, MD, MB, MI, MS)

#define SHRLir(IM, RD)          _ROTSHILir(X86_SHR, IM, RD)
#define SHRLim(IM, MD, MB, MI, MS)  _ROTSHILim(X86_SHR, IM, MD, MB, MI, MS)
#define SHRLrr(RS, RD)          _ROTSHILrr(X86_SHR, RS, RD)
#define SHRLrm(RS, MD, MB, MI, MS)  _ROTSHILrm(X86_SHR, RS, MD, MB, MI, MS)

#define SHRQir(IM, RD)          _ROTSHIQir(X86_SHR, IM, RD)
#define SHRQim(IM, MD, MB, MI, MS)  _ROTSHIQim(X86_SHR, IM, MD, MB, MI, MS)
#define SHRQrr(RS, RD)          _ROTSHIQrr(X86_SHR, RS, RD)
#define SHRQrm(RS, MD, MB, MI, MS)  _ROTSHIQrm(X86_SHR, RS, MD, MB, MI, MS)

#define SALBir SHLBir
#define SALBim SHLBim
#define SALBrr SHLBrr
#define SALBrm SHLBrm

#define SALWir SHLWir
#define SALWim SHLWim
#define SALWrr SHLWrr
#define SALWrm SHLWrm

#define SALLir SHLLir
#define SALLim SHLLim
#define SALLrr SHLLrr
#define SALLrm SHLLrm

#define SALQir SHLQir
#define SALQim SHLQim
#define SALQrr SHLQrr
#define SALQrm SHLQrm

#define SARBir(IM, RD)          _ROTSHIBir(X86_SAR, IM, RD)
#define SARBim(IM, MD, MB, MI, MS)  _ROTSHIBim(X86_SAR, IM, MD, MB, MI, MS)
#define SARBrr(RS, RD)          _ROTSHIBrr(X86_SAR, RS, RD)
#define SARBrm(RS, MD, MB, MI, MS)  _ROTSHIBrm(X86_SAR, RS, MD, MB, MI, MS)

#define SARWir(IM, RD)          _ROTSHIWir(X86_SAR, IM, RD)
#define SARWim(IM, MD, MB, MI, MS)  _ROTSHIWim(X86_SAR, IM, MD, MB, MI, MS)
#define SARWrr(RS, RD)          _ROTSHIWrr(X86_SAR, RS, RD)
#define SARWrm(RS, MD, MB, MI, MS)  _ROTSHIWrm(X86_SAR, RS, MD, MB, MI, MS)

#define SARLir(IM, RD)          _ROTSHILir(X86_SAR, IM, RD)
#define SARLim(IM, MD, MB, MI, MS)  _ROTSHILim(X86_SAR, IM, MD, MB, MI, MS)
#define SARLrr(RS, RD)          _ROTSHILrr(X86_SAR, RS, RD)
#define SARLrm(RS, MD, MB, MI, MS)  _ROTSHILrm(X86_SAR, RS, MD, MB, MI, MS)

#define SARQir(IM, RD)          _ROTSHIQir(X86_SAR, IM, RD)
#define SARQim(IM, MD, MB, MI, MS)  _ROTSHIQim(X86_SAR, IM, MD, MB, MI, MS)
#define SARQrr(RS, RD)          _ROTSHIQrr(X86_SAR, RS, RD)
#define SARQrm(RS, MD, MB, MI, MS)  _ROTSHIQrm(X86_SAR, RS, MD, MB, MI, MS)


/* --- Bit test instructions ----------------------------------------------- */

enum
{
    X86_BT = 4,
    X86_BTS = 5,
    X86_BTR = 6,
    X86_BTC = 7,
};

/*									_format		Opcd		 ,Mod ,r      ,m	,mem=dsp+sib	,imm... */

#define _BTWir(OP, IM, RD)      (_d16(), _REXLrr(0, RD), _OO_Mrm_B(0x0fba, _b11, OP, _r2(RD), _u8(IM)))
#define _BTWim(OP, IM, MD, MB, MI, MS)  (_d16(), _REXLrm(0, MB, MI), _OO_r_X_B(0x0fba, OP, MD, MB, MI, MS, _u8(IM)))
#define _BTWrr(OP, RS, RD)      (_d16(), _REXLrr(RS, RD), _OO_Mrm(0x0f83 | ((OP) << 3), _b11, _r2(RS), _r2(RD)))
#define _BTWrm(OP, RS, MD, MB, MI, MS)  (_d16(), _REXLrm(RS, MB, MI), _OO_r_X(0x0f83 | ((OP) << 3), _r2(RS), MD, MB, MI, MS))

#define _BTLir(OP, IM, RD)      (_REXLrr(0, RD), _OO_Mrm_B(0x0fba, _b11, OP, _r4(RD), _u8(IM)))
#define _BTLim(OP, IM, MD, MB, MI, MS)  (_REXLrm(0, MB, MI), _OO_r_X_B(0x0fba, OP, MD, MB, MI, MS, _u8(IM)))
#define _BTLrr(OP, RS, RD)      (_REXLrr(RS, RD), _OO_Mrm(0x0f83 | ((OP) << 3), _b11, _r4(RS), _r4(RD)))
#define _BTLrm(OP, RS, MD, MB, MI, MS)  (_REXLrm(RS, MB, MI), _OO_r_X(0x0f83 | ((OP) << 3), _r4(RS), MD, MB, MI, MS))

#define _BTQir(OP, IM, RD)      (_REXQrr(0, RD), _OO_Mrm_B(0x0fba, _b11, OP, _r8(RD), _u8(IM)))
#define _BTQim(OP, IM, MD, MB, MI, MS)  (_REXQrm(0, MB, MI), _OO_r_X_B(0x0fba, OP, MD, MB, MI, MS, _u8(IM)))
#define _BTQrr(OP, RS, RD)      (_REXQrr(RS, RD), _OO_Mrm(0x0f83 | ((OP) << 3), _b11, _r8(RS), _r8(RD)))
#define _BTQrm(OP, RS, MD, MB, MI, MS)  (_REXQrm(RS, MB, MI), _OO_r_X(0x0f83 | ((OP) << 3), _r8(RS), MD, MB, MI, MS))
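
/* BT/BTS/BTR/BTC pair an immediate form (0f ba /4../7) with a register form
 * obtained by OR-ing (OP << 3) into 0x0f83.  For instance (32-bit mode
 * assumed):
 *
 *	BTLir(3, X86_EAX)          emits 0f ba e0 03   btl $3,%eax
 *	BTSLrr(X86_ECX, X86_EAX)   emits 0f ab c8      btsl %ecx,%eax
 */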

#define BTWir(IM, RD)           _BTWir(X86_BT, IM, RD)
#define BTWim(IM, MD, MB, MI, MS)   _BTWim(X86_BT, IM, MD, MB, MI, MS)
#define BTWrr(RS, RD)           _BTWrr(X86_BT, RS, RD)
#define BTWrm(RS, MD, MB, MI, MS)   _BTWrm(X86_BT, RS, MD, MB, MI, MS)

#define BTLir(IM, RD)           _BTLir(X86_BT, IM, RD)
#define BTLim(IM, MD, MB, MI, MS)   _BTLim(X86_BT, IM, MD, MB, MI, MS)
#define BTLrr(RS, RD)           _BTLrr(X86_BT, RS, RD)
#define BTLrm(RS, MD, MB, MI, MS)   _BTLrm(X86_BT, RS, MD, MB, MI, MS)

#define BTQir(IM, RD)           _BTQir(X86_BT, IM, RD)
#define BTQim(IM, MD, MB, MI, MS)   _BTQim(X86_BT, IM, MD, MB, MI, MS)
#define BTQrr(RS, RD)           _BTQrr(X86_BT, RS, RD)
#define BTQrm(RS, MD, MB, MI, MS)   _BTQrm(X86_BT, RS, MD, MB, MI, MS)

#define BTCWir(IM, RD)          _BTWir(X86_BTC, IM, RD)
#define BTCWim(IM, MD, MB, MI, MS)  _BTWim(X86_BTC, IM, MD, MB, MI, MS)
#define BTCWrr(RS, RD)          _BTWrr(X86_BTC, RS, RD)
#define BTCWrm(RS, MD, MB, MI, MS)  _BTWrm(X86_BTC, RS, MD, MB, MI, MS)

#define BTCLir(IM, RD)          _BTLir(X86_BTC, IM, RD)
#define BTCLim(IM, MD, MB, MI, MS)  _BTLim(X86_BTC, IM, MD, MB, MI, MS)
#define BTCLrr(RS, RD)          _BTLrr(X86_BTC, RS, RD)
#define BTCLrm(RS, MD, MB, MI, MS)  _BTLrm(X86_BTC, RS, MD, MB, MI, MS)

#define BTCQir(IM, RD)          _BTQir(X86_BTC, IM, RD)
#define BTCQim(IM, MD, MB, MI, MS)  _BTQim(X86_BTC, IM, MD, MB, MI, MS)
#define BTCQrr(RS, RD)          _BTQrr(X86_BTC, RS, RD)
#define BTCQrm(RS, MD, MB, MI, MS)  _BTQrm(X86_BTC, RS, MD, MB, MI, MS)

#define BTRWir(IM, RD)          _BTWir(X86_BTR, IM, RD)
#define BTRWim(IM, MD, MB, MI, MS)  _BTWim(X86_BTR, IM, MD, MB, MI, MS)
#define BTRWrr(RS, RD)          _BTWrr(X86_BTR, RS, RD)
#define BTRWrm(RS, MD, MB, MI, MS)  _BTWrm(X86_BTR, RS, MD, MB, MI, MS)

#define BTRLir(IM, RD)          _BTLir(X86_BTR, IM, RD)
#define BTRLim(IM, MD, MB, MI, MS)  _BTLim(X86_BTR, IM, MD, MB, MI, MS)
#define BTRLrr(RS, RD)          _BTLrr(X86_BTR, RS, RD)
#define BTRLrm(RS, MD, MB, MI, MS)  _BTLrm(X86_BTR, RS, MD, MB, MI, MS)

#define BTRQir(IM, RD)          _BTQir(X86_BTR, IM, RD)
#define BTRQim(IM, MD, MB, MI, MS)  _BTQim(X86_BTR, IM, MD, MB, MI, MS)
#define BTRQrr(RS, RD)          _BTQrr(X86_BTR, RS, RD)
#define BTRQrm(RS, MD, MB, MI, MS)  _BTQrm(X86_BTR, RS, MD, MB, MI, MS)

#define BTSWir(IM, RD)          _BTWir(X86_BTS, IM, RD)
#define BTSWim(IM, MD, MB, MI, MS)  _BTWim(X86_BTS, IM, MD, MB, MI, MS)
#define BTSWrr(RS, RD)          _BTWrr(X86_BTS, RS, RD)
#define BTSWrm(RS, MD, MB, MI, MS)  _BTWrm(X86_BTS, RS, MD, MB, MI, MS)

#define BTSLir(IM, RD)          _BTLir(X86_BTS, IM, RD)
#define BTSLim(IM, MD, MB, MI, MS)  _BTLim(X86_BTS, IM, MD, MB, MI, MS)
#define BTSLrr(RS, RD)          _BTLrr(X86_BTS, RS, RD)
#define BTSLrm(RS, MD, MB, MI, MS)  _BTLrm(X86_BTS, RS, MD, MB, MI, MS)

#define BTSQir(IM, RD)          _BTQir(X86_BTS, IM, RD)
#define BTSQim(IM, MD, MB, MI, MS)  _BTQim(X86_BTS, IM, MD, MB, MI, MS)
#define BTSQrr(RS, RD)          _BTQrr(X86_BTS, RS, RD)
#define BTSQrm(RS, MD, MB, MI, MS)  _BTQrm(X86_BTS, RS, MD, MB, MI, MS)


/* --- Move instructions --------------------------------------------------- */

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define MOVBrr(RS, RD)          (_REXBrr(RS, RD), _O_Mrm(0x88, _b11, _r1(RS), _r1(RD)))
#define MOVBmr(MD, MB, MI, MS, RD)  (_REXBmr(MB, MI, RD), _O_r_X(0x8a, _r1(RD), MD, MB, MI, MS))
#define MOVBrm(RS, MD, MB, MI, MS)  (_REXBrm(RS, MB, MI), _O_r_X(0x88, _r1(RS), MD, MB, MI, MS))
#define MOVBir(IM, R)          (_REXBrr(0, R), _Or_B(0xb0, _r1(R), _su8(IM)))
#define MOVBim(IM, MD, MB, MI, MS)  (_REXBrm(0, MB, MI), _O_X_B(0xc6, MD, MB, MI, MS, _su8(IM)))

#define MOVWrr(RS, RD)          (_d16(), _REXLrr(RS, RD), _O_Mrm(0x89, _b11, _r2(RS), _r2(RD)))
#define MOVWmr(MD, MB, MI, MS, RD)  (_d16(), _REXLmr(MB, MI, RD), _O_r_X(0x8b, _r2(RD), MD, MB, MI, MS))
#define MOVWrm(RS, MD, MB, MI, MS)  (_d16(), _REXLrm(RS, MB, MI), _O_r_X(0x89, _r2(RS), MD, MB, MI, MS))
#define MOVWir(IM, R)          (_d16(), _REXLrr(0, R), _Or_W(0xb8, _r2(R), _su16(IM)))
#define MOVWim(IM, MD, MB, MI, MS)  (_d16(), _REXLrm(0, MB, MI), _O_X_W(0xc7, MD, MB, MI, MS, _su16(IM)))

#define MOVLrr(RS, RD)          (_REXLrr(RS, RD), _O_Mrm(0x89, _b11, _r4(RS), _r4(RD)))
#define MOVLmr(MD, MB, MI, MS, RD)  (_REXLmr(MB, MI, RD), _O_r_X(0x8b, _r4(RD), MD, MB, MI, MS))
#define MOVLrm(RS, MD, MB, MI, MS)  (_REXLrm(RS, MB, MI), _O_r_X(0x89, _r4(RS), MD, MB, MI, MS))
#define MOVLir(IM, R)          (_REXLrr(0, R), _Or_L(0xb8, _r4(R), IM))
#define MOVLim(IM, MD, MB, MI, MS)  (_REXLrm(0, MB, MI), _O_X_L(0xc7, MD, MB, MI, MS, IM))

#define MOVQrr(RS, RD)          (_REXQrr(RS, RD), _O_Mrm(0x89, _b11, _r8(RS), _r8(RD)))
#define MOVQmr(MD, MB, MI, MS, RD)  (_REXQmr(MB, MI, RD), _O_r_X(0x8b, _r8(RD), MD, MB, MI, MS))
#define MOVQrm(RS, MD, MB, MI, MS)  (_REXQrm(RS, MB, MI), _O_r_X(0x89, _r8(RS), MD, MB, MI, MS))
#define MOVQir(IM, R)          (_REXQrr(0, R), _Or_Q(0xb8, _r8(R), IM))
#define MOVQim(IM, MD, MB, MI, MS)  (_REXQrm(0, MB, MI), _O_X_L(0xc7, MD, MB, MI, MS, IM))
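
/* Note that MOVQir loads a full 64-bit immediate (REX.W + 0xb8+r, i.e.
 * "movabs"), whereas MOVQim stores a 32-bit immediate sign-extended to 64
 * bits (0xc7 /0).  For instance (x86-64 mode assumed):
 *
 *	MOVQir(0x1122334455667788ull, X86_RAX)  emits 48 b8 88 77 66 55 44 33 22 11
 *	MOVLir(5, X86_R9D)                      emits 41 b9 05 00 00 00
 */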


/* --- Unary and Multiply/Divide instructions ------------------------------ */

enum
{
    X86_NOT = 2,
    X86_NEG = 3,
    X86_MUL = 4,
    X86_IMUL = 5,
    X86_DIV = 6,
    X86_IDIV = 7,
};

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define _UNARYBr(OP, RS)        (_REXBrr(0, RS), _O_Mrm(0xf6, _b11, OP, _r1(RS)))
#define _UNARYBm(OP, MD, MB, MI, MS)    (_REXBrm(0, MB, MI), _O_r_X(0xf6, OP, MD, MB, MI, MS))
#define _UNARYWr(OP, RS)        (_d16(), _REXLrr(0, RS), _O_Mrm(0xf7, _b11, OP, _r2(RS)))
#define _UNARYWm(OP, MD, MB, MI, MS)    (_d16(), _REXLmr(MB, MI, 0), _O_r_X(0xf7, OP, MD, MB, MI, MS))
#define _UNARYLr(OP, RS)        (_REXLrr(0, RS), _O_Mrm(0xf7, _b11, OP, _r4(RS)))
#define _UNARYLm(OP, MD, MB, MI, MS)    (_REXLmr(MB, MI, 0), _O_r_X(0xf7, OP, MD, MB, MI, MS))
#define _UNARYQr(OP, RS)        (_REXQrr(0, RS), _O_Mrm(0xf7, _b11, OP, _r8(RS)))
#define _UNARYQm(OP, MD, MB, MI, MS)    (_REXQmr(MB, MI, 0), _O_r_X(0xf7, OP, MD, MB, MI, MS))

#define NOTBr(RS)           _UNARYBr(X86_NOT, RS)
#define NOTBm(MD, MB, MI, MS)       _UNARYBm(X86_NOT, MD, MB, MI, MS)
#define NOTWr(RS)           _UNARYWr(X86_NOT, RS)
#define NOTWm(MD, MB, MI, MS)       _UNARYWm(X86_NOT, MD, MB, MI, MS)
#define NOTLr(RS)           _UNARYLr(X86_NOT, RS)
#define NOTLm(MD, MB, MI, MS)       _UNARYLm(X86_NOT, MD, MB, MI, MS)
#define NOTQr(RS)           _UNARYQr(X86_NOT, RS)
#define NOTQm(MD, MB, MI, MS)       _UNARYQm(X86_NOT, MD, MB, MI, MS)

#define NEGBr(RS)           _UNARYBr(X86_NEG, RS)
#define NEGBm(MD, MB, MI, MS)       _UNARYBm(X86_NEG, MD, MB, MI, MS)
#define NEGWr(RS)           _UNARYWr(X86_NEG, RS)
#define NEGWm(MD, MB, MI, MS)       _UNARYWm(X86_NEG, MD, MB, MI, MS)
#define NEGLr(RS)           _UNARYLr(X86_NEG, RS)
#define NEGLm(MD, MB, MI, MS)       _UNARYLm(X86_NEG, MD, MB, MI, MS)
#define NEGQr(RS)           _UNARYQr(X86_NEG, RS)
#define NEGQm(MD, MB, MI, MS)       _UNARYQm(X86_NEG, MD, MB, MI, MS)

#define MULBr(RS)           _UNARYBr(X86_MUL, RS)
#define MULBm(MD, MB, MI, MS)       _UNARYBm(X86_MUL, MD, MB, MI, MS)
#define MULWr(RS)           _UNARYWr(X86_MUL, RS)
#define MULWm(MD, MB, MI, MS)       _UNARYWm(X86_MUL, MD, MB, MI, MS)
#define MULLr(RS)           _UNARYLr(X86_MUL, RS)
#define MULLm(MD, MB, MI, MS)       _UNARYLm(X86_MUL, MD, MB, MI, MS)
#define MULQr(RS)           _UNARYQr(X86_MUL, RS)
#define MULQm(MD, MB, MI, MS)       _UNARYQm(X86_MUL, MD, MB, MI, MS)

#define IMULBr(RS)          _UNARYBr(X86_IMUL, RS)
#define IMULBm(MD, MB, MI, MS)      _UNARYBm(X86_IMUL, MD, MB, MI, MS)
#define IMULWr(RS)          _UNARYWr(X86_IMUL, RS)
#define IMULWm(MD, MB, MI, MS)      _UNARYWm(X86_IMUL, MD, MB, MI, MS)
#define IMULLr(RS)          _UNARYLr(X86_IMUL, RS)
#define IMULLm(MD, MB, MI, MS)      _UNARYLm(X86_IMUL, MD, MB, MI, MS)
#define IMULQr(RS)          _UNARYQr(X86_IMUL, RS)
#define IMULQm(MD, MB, MI, MS)      _UNARYQm(X86_IMUL, MD, MB, MI, MS)

#define DIVBr(RS)           _UNARYBr(X86_DIV, RS)
#define DIVBm(MD, MB, MI, MS)       _UNARYBm(X86_DIV, MD, MB, MI, MS)
#define DIVWr(RS)           _UNARYWr(X86_DIV, RS)
#define DIVWm(MD, MB, MI, MS)       _UNARYWm(X86_DIV, MD, MB, MI, MS)
#define DIVLr(RS)           _UNARYLr(X86_DIV, RS)
#define DIVLm(MD, MB, MI, MS)       _UNARYLm(X86_DIV, MD, MB, MI, MS)
#define DIVQr(RS)           _UNARYQr(X86_DIV, RS)
#define DIVQm(MD, MB, MI, MS)       _UNARYQm(X86_DIV, MD, MB, MI, MS)

#define IDIVBr(RS)          _UNARYBr(X86_IDIV, RS)
#define IDIVBm(MD, MB, MI, MS)      _UNARYBm(X86_IDIV, MD, MB, MI, MS)
#define IDIVWr(RS)          _UNARYWr(X86_IDIV, RS)
#define IDIVWm(MD, MB, MI, MS)      _UNARYWm(X86_IDIV, MD, MB, MI, MS)
#define IDIVLr(RS)          _UNARYLr(X86_IDIV, RS)
#define IDIVLm(MD, MB, MI, MS)      _UNARYLm(X86_IDIV, MD, MB, MI, MS)
#define IDIVQr(RS)          _UNARYQr(X86_IDIV, RS)
#define IDIVQm(MD, MB, MI, MS)      _UNARYQm(X86_IDIV, MD, MB, MI, MS)
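
/* Note (hardware convention, for reference): the one-operand MUL/IMUL
   forms widen into the DX:AX register pair, e.g. MULLr(R) computes the
   unsigned product EDX:EAX = EAX * R, and IDIVLr(R) divides the signed
   64-bit value in EDX:EAX by R, leaving the quotient in EAX and the
   remainder in EDX.  */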

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define IMULWrr(RS, RD)         (_d16(), _REXLrr(RD, RS), _OO_Mrm(0x0faf, _b11, _r2(RD), _r2(RS)))
#define IMULWmr(MD, MB, MI, MS, RD) (_d16(), _REXLmr(MB, MI, RD), _OO_r_X(0x0faf, _r2(RD), MD, MB, MI, MS))

#define IMULWirr(IM, RS, RD)      (_d16(), _REXLrr(RS, RD), _Os_Mrm_sW(0x69, _b11, _r2(RS), _r2(RD), _su16(IM)))
#define IMULWimr(IM, MD, MB, MI, MS, RD) (_d16(), _REXLmr(MB, MI, RD), _Os_r_X_sW(0x69, _r2(RD), MD, MB, MI, MS, _su16(IM)))

#define IMULLir(IM, RD)         (_REXLrr(0, RD), _Os_Mrm_sL(0x69, _b11, _r4(RD), _r4(RD), IM))
#define IMULLrr(RS, RD)         (_REXLrr(RD, RS), _OO_Mrm(0x0faf, _b11, _r4(RD), _r4(RS)))
#define IMULLmr(MD, MB, MI, MS, RD) (_REXLmr(MB, MI, RD), _OO_r_X(0x0faf, _r4(RD), MD, MB, MI, MS))

#define IMULQir(IM, RD)         (_REXQrr(0, RD), _Os_Mrm_sL(0x69, _b11, _r8(RD), _r8(RD), IM))
#define IMULQrr(RS, RD)         (_REXQrr(RD, RS), _OO_Mrm(0x0faf, _b11, _r8(RD), _r8(RS)))
#define IMULQmr(MD, MB, MI, MS, RD) (_REXQmr(MB, MI, RD), _OO_r_X(0x0faf, _r8(RD), MD, MB, MI, MS))

#define IMULLirr(IM, RS, RD)      (_REXLrr(RS, RD), _Os_Mrm_sL(0x69, _b11, _r4(RS), _r4(RD), IM))
#define IMULLimr(IM, MD, MB, MI, MS, RD) (_REXLmr(MB, MI, RD), _Os_r_X_sL(0x69, _r4(RD), MD, MB, MI, MS, IM))

#define IMULQirr(IM, RS, RD)      (_REXQrr(RS, RD), _Os_Mrm_sL(0x69, _b11, _r8(RS), _r8(RD), IM))
#define IMULQimr(IM, MD, MB, MI, MS, RD) (_REXQmr(MB, MI, RD), _Os_r_X_sL(0x69, _r8(RD), MD, MB, MI, MS, IM))
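
/* Usage sketch: the two- and three-operand IMUL forms truncate the
 * product to the destination width, e.g.
 *
 *	IMULLrr(X86_ECX, X86_EAX);	// imull %ecx, %eax
 *	IMULLirr(10, X86_ECX, X86_EAX);	// imull $10, %ecx, %eax
 */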


/* --- Control Flow related instructions ----------------------------------- */

enum
{
    X86_CC_O = 0x0,
    X86_CC_NO = 0x1,
    X86_CC_NAE = 0x2,
    X86_CC_B = 0x2,
    X86_CC_C = 0x2,
    X86_CC_AE = 0x3,
    X86_CC_NB = 0x3,
    X86_CC_NC = 0x3,
    X86_CC_E = 0x4,
    X86_CC_Z = 0x4,
    X86_CC_NE = 0x5,
    X86_CC_NZ = 0x5,
    X86_CC_BE = 0x6,
    X86_CC_NA = 0x6,
    X86_CC_A = 0x7,
    X86_CC_NBE = 0x7,
    X86_CC_S = 0x8,
    X86_CC_NS = 0x9,
    X86_CC_P = 0xa,
    X86_CC_PE = 0xa,
    X86_CC_NP = 0xb,
    X86_CC_PO = 0xb,
    X86_CC_L = 0xc,
    X86_CC_NGE = 0xc,
    X86_CC_GE = 0xd,
    X86_CC_NL = 0xd,
    X86_CC_LE = 0xe,
    X86_CC_NG = 0xe,
    X86_CC_G = 0xf,
    X86_CC_NLE = 0xf,
};
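
/* These values are the condition-code nibble shared by every
   conditional encoding below: 0x70|CC (Jcc short), 0x0f80|CC (Jcc
   near), 0x0f90|CC (SETcc) and 0x0f40|CC (CMOVcc).  */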

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

// FIXME: no prefix is available to encode a 32-bit operand size in 64-bit mode
#define CALLm(M)                            _O_D32(0xe8, (int)(M))
#define _CALLLsr(R)         (_REXLrr(0, R), _O_Mrm(0xff, _b11, _b010, _r4(R)))
#define _CALLQsr(R)         (_REXLrr(0, R), _O_Mrm(0xff, _b11, _b010, _r8(R)))
#define CALLsr(R)           (X86_TARGET_64BIT ? _CALLQsr(R) : _CALLLsr(R))
#define CALLsm(D, B, I, S)         (_REXLrm(0, B, I), _O_r_X(0xff, _b010, (int)(D), B, I, S))

// FIXME: no prefix is available to encode a 32-bit operand size in 64-bit mode
#define JMPSm(M)                            _O_D8(0xeb, (int)(M))
#define JMPm(M)                             _O_D32(0xe9, (int)(M))
#define _JMPLsr(R)          (_REXLrr(0, R), _O_Mrm(0xff, _b11, _b100, _r4(R)))
#define _JMPQsr(R)          (_REXLrr(0, R), _O_Mrm(0xff, _b11, _b100, _r8(R)))
#define JMPsr(R)            (X86_TARGET_64BIT ? _JMPQsr(R) : _JMPLsr(R))
#define JMPsm(D, B, I, S)          (_REXLrm(0, B, I), _O_r_X(0xff, _b100, (int)(D), B, I, S))

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */
#define JCCSii(CC, D)                           _O_B(0x70 | (CC), (_sc)(int)(D))
#define JCCSim(CC, D)                           _O_D8(0x70 | (CC), (int)(D))
#define JOSm(D)             JCCSim(X86_CC_O, D)
#define JNOSm(D)            JCCSim(X86_CC_NO, D)
#define JBSm(D)             JCCSim(X86_CC_B, D)
#define JNAESm(D)           JCCSim(X86_CC_NAE, D)
#define JNBSm(D)            JCCSim(X86_CC_NB, D)
#define JAESm(D)            JCCSim(X86_CC_AE, D)
#define JESm(D)             JCCSim(X86_CC_E, D)
#define JZSm(D)             JCCSim(X86_CC_Z, D)
#define JNESm(D)            JCCSim(X86_CC_NE, D)
#define JNZSm(D)            JCCSim(X86_CC_NZ, D)
#define JBESm(D)            JCCSim(X86_CC_BE, D)
#define JNASm(D)            JCCSim(X86_CC_NA, D)
#define JNBESm(D)           JCCSim(X86_CC_NBE, D)
#define JASm(D)             JCCSim(X86_CC_A, D)
#define JSSm(D)             JCCSim(X86_CC_S, D)
#define JNSSm(D)            JCCSim(X86_CC_NS, D)
#define JPSm(D)             JCCSim(X86_CC_P, D)
#define JPESm(D)            JCCSim(X86_CC_PE, D)
#define JNPSm(D)            JCCSim(X86_CC_NP, D)
#define JPOSm(D)            JCCSim(X86_CC_PO, D)
#define JLSm(D)             JCCSim(X86_CC_L, D)
#define JNGESm(D)           JCCSim(X86_CC_NGE, D)
#define JNLSm(D)            JCCSim(X86_CC_NL, D)
#define JGESm(D)            JCCSim(X86_CC_GE, D)
#define JLESm(D)            JCCSim(X86_CC_LE, D)
#define JNGSm(D)            JCCSim(X86_CC_NG, D)
#define JNLESm(D)           JCCSim(X86_CC_NLE, D)
#define JGSm(D)             JCCSim(X86_CC_G, D)

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */
#define JCCii(CC, D)                            _OO_L(0x0f80 | (CC), (int)(D))
#define JCCim(CC, D)                            _OO_D32(0x0f80 | (CC), (int)(D))
#define JOm(D)              JCCim(X86_CC_O, D)
#define JNOm(D)             JCCim(X86_CC_NO, D)
#define JBm(D)              JCCim(X86_CC_B, D)
#define JNAEm(D)            JCCim(X86_CC_NAE, D)
#define JNBm(D)             JCCim(X86_CC_NB, D)
#define JAEm(D)             JCCim(X86_CC_AE, D)
#define JEm(D)              JCCim(X86_CC_E, D)
#define JZm(D)              JCCim(X86_CC_Z, D)
#define JNEm(D)             JCCim(X86_CC_NE, D)
#define JNZm(D)             JCCim(X86_CC_NZ, D)
#define JBEm(D)             JCCim(X86_CC_BE, D)
#define JNAm(D)             JCCim(X86_CC_NA, D)
#define JNBEm(D)            JCCim(X86_CC_NBE, D)
#define JAm(D)              JCCim(X86_CC_A, D)
#define JSm(D)              JCCim(X86_CC_S, D)
#define JNSm(D)             JCCim(X86_CC_NS, D)
#define JPm(D)              JCCim(X86_CC_P, D)
#define JPEm(D)             JCCim(X86_CC_PE, D)
#define JNPm(D)             JCCim(X86_CC_NP, D)
#define JPOm(D)             JCCim(X86_CC_PO, D)
#define JLm(D)              JCCim(X86_CC_L, D)
#define JNGEm(D)            JCCim(X86_CC_NGE, D)
#define JNLm(D)             JCCim(X86_CC_NL, D)
#define JGEm(D)             JCCim(X86_CC_GE, D)
#define JLEm(D)             JCCim(X86_CC_LE, D)
#define JNGm(D)             JCCim(X86_CC_NG, D)
#define JNLEm(D)            JCCim(X86_CC_NLE, D)
#define JGm(D)              JCCim(X86_CC_G, D)
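
/* Usage sketch (assuming, as in CCG, that the displacement emitters
 * resolve D against the current emission point): forward branches are
 * typically emitted against a dummy target and patched once the real
 * target is known, e.g.
 *
 *	unsigned char *jcc = x86_get_target();
 *	JNEm(jcc);			// 0f 85 rel32, patched below
 *	...				// fall-through code
 *	*(int *)(jcc + 2) = (int)(x86_get_target() - (jcc + 6));
 */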

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */
#define SETCCir(CC, RD)         (_REXBrr(0, RD), _OO_Mrm(0x0f90 | (CC), _b11, _b000, _r1(RD)))
#define SETOr(RD)           SETCCir(X86_CC_O, RD)
#define SETNOr(RD)          SETCCir(X86_CC_NO, RD)
#define SETBr(RD)           SETCCir(X86_CC_B, RD)
#define SETNAEr(RD)         SETCCir(X86_CC_NAE, RD)
#define SETNBr(RD)          SETCCir(X86_CC_NB, RD)
#define SETAEr(RD)          SETCCir(X86_CC_AE, RD)
#define SETEr(RD)           SETCCir(X86_CC_E, RD)
#define SETZr(RD)           SETCCir(X86_CC_Z, RD)
#define SETNEr(RD)          SETCCir(X86_CC_NE, RD)
#define SETNZr(RD)          SETCCir(X86_CC_NZ, RD)
#define SETBEr(RD)          SETCCir(X86_CC_BE, RD)
#define SETNAr(RD)          SETCCir(X86_CC_NA, RD)
#define SETNBEr(RD)         SETCCir(X86_CC_NBE, RD)
#define SETAr(RD)           SETCCir(X86_CC_A, RD)
#define SETSr(RD)           SETCCir(X86_CC_S, RD)
#define SETNSr(RD)          SETCCir(X86_CC_NS, RD)
#define SETPr(RD)           SETCCir(X86_CC_P, RD)
#define SETPEr(RD)          SETCCir(X86_CC_PE, RD)
#define SETNPr(RD)          SETCCir(X86_CC_NP, RD)
#define SETPOr(RD)          SETCCir(X86_CC_PO, RD)
#define SETLr(RD)           SETCCir(X86_CC_L, RD)
#define SETNGEr(RD)         SETCCir(X86_CC_NGE, RD)
#define SETNLr(RD)          SETCCir(X86_CC_NL, RD)
#define SETGEr(RD)          SETCCir(X86_CC_GE, RD)
#define SETLEr(RD)          SETCCir(X86_CC_LE, RD)
#define SETNGr(RD)          SETCCir(X86_CC_NG, RD)
#define SETNLEr(RD)         SETCCir(X86_CC_NLE, RD)
#define SETGr(RD)           SETCCir(X86_CC_G, RD)

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */
#define SETCCim(CC, MD, MB, MI, MS)     (_REXBrm(0, MB, MI), _OO_r_X(0x0f90 | (CC), _b000, MD, MB, MI, MS))
#define SETOm(D, B, I, S)       SETCCim(X86_CC_O, D, B, I, S)
#define SETNOm(D, B, I, S)      SETCCim(X86_CC_NO, D, B, I, S)
#define SETBm(D, B, I, S)       SETCCim(X86_CC_B, D, B, I, S)
#define SETNAEm(D, B, I, S)     SETCCim(X86_CC_NAE, D, B, I, S)
#define SETNBm(D, B, I, S)      SETCCim(X86_CC_NB, D, B, I, S)
#define SETAEm(D, B, I, S)      SETCCim(X86_CC_AE, D, B, I, S)
#define SETEm(D, B, I, S)       SETCCim(X86_CC_E, D, B, I, S)
#define SETZm(D, B, I, S)       SETCCim(X86_CC_Z, D, B, I, S)
#define SETNEm(D, B, I, S)      SETCCim(X86_CC_NE, D, B, I, S)
#define SETNZm(D, B, I, S)      SETCCim(X86_CC_NZ, D, B, I, S)
#define SETBEm(D, B, I, S)      SETCCim(X86_CC_BE, D, B, I, S)
#define SETNAm(D, B, I, S)      SETCCim(X86_CC_NA, D, B, I, S)
#define SETNBEm(D, B, I, S)     SETCCim(X86_CC_NBE, D, B, I, S)
#define SETAm(D, B, I, S)       SETCCim(X86_CC_A, D, B, I, S)
#define SETSm(D, B, I, S)       SETCCim(X86_CC_S, D, B, I, S)
#define SETNSm(D, B, I, S)      SETCCim(X86_CC_NS, D, B, I, S)
#define SETPm(D, B, I, S)       SETCCim(X86_CC_P, D, B, I, S)
#define SETPEm(D, B, I, S)      SETCCim(X86_CC_PE, D, B, I, S)
#define SETNPm(D, B, I, S)      SETCCim(X86_CC_NP, D, B, I, S)
#define SETPOm(D, B, I, S)      SETCCim(X86_CC_PO, D, B, I, S)
#define SETLm(D, B, I, S)       SETCCim(X86_CC_L, D, B, I, S)
#define SETNGEm(D, B, I, S)     SETCCim(X86_CC_NGE, D, B, I, S)
#define SETNLm(D, B, I, S)      SETCCim(X86_CC_NL, D, B, I, S)
#define SETGEm(D, B, I, S)      SETCCim(X86_CC_GE, D, B, I, S)
#define SETLEm(D, B, I, S)      SETCCim(X86_CC_LE, D, B, I, S)
#define SETNGm(D, B, I, S)      SETCCim(X86_CC_NG, D, B, I, S)
#define SETNLEm(D, B, I, S)     SETCCim(X86_CC_NLE, D, B, I, S)
#define SETGm(D, B, I, S)       SETCCim(X86_CC_G, D, B, I, S)
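
/* Usage sketch: SETcc writes only the low byte, so booleans are
 * materialized by pairing it with a zero-extending move (MOVZBLrr,
 * defined below):
 *
 *	SETNEr(X86_AL);			// setne %al
 *	MOVZBLrr(X86_AL, X86_EAX);	// movzbl %al, %eax
 */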

/*									_format		Opcd		,Mod ,r	     ,m		,mem=dsp+sib	,imm... */
#define CMOVWrr(CC, RS, RD)       (_d16(), _REXLrr(RD, RS), _OO_Mrm(0x0f40 | (CC), _b11, _r2(RD), _r2(RS)))
#define CMOVWmr(CC, MD, MB, MI, MS, RD)  (_d16(), _REXLmr(MB, MI, RD), _OO_r_X(0x0f40 | (CC), _r2(RD), MD, MB, MI, MS))
#define CMOVLrr(CC, RS, RD)       (_REXLrr(RD, RS), _OO_Mrm(0x0f40 | (CC), _b11, _r4(RD), _r4(RS)))
#define CMOVLmr(CC, MD, MB, MI, MS, RD)  (_REXLmr(MB, MI, RD), _OO_r_X(0x0f40 | (CC), _r4(RD), MD, MB, MI, MS))
#define CMOVQrr(CC, RS, RD)       (_REXQrr(RD, RS), _OO_Mrm(0x0f40 | (CC), _b11, _r8(RD), _r8(RS)))
#define CMOVQmr(CC, MD, MB, MI, MS, RD)  (_REXQmr(MB, MI, RD), _OO_r_X(0x0f40 | (CC), _r8(RD), MD, MB, MI, MS))
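
/* Usage sketch: CMOVcc enables branch-free selection; assuming the
 * CMPLrr ALU macro defined earlier, this keeps max(%eax, %ecx) in
 * %eax:
 *
 *	CMPLrr(X86_ECX, X86_EAX);		// cmpl %ecx, %eax
 *	CMOVLrr(X86_CC_L, X86_ECX, X86_EAX);	// cmovl %ecx, %eax
 */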


/* --- Push/Pop instructions ----------------------------------------------- */

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define POPWr(RD)           _m32only((_d16(), _Or(0x58, _r2(RD))))
#define POPWm(MD, MB, MI, MS)       _m32only((_d16(), _O_r_X(0x8f, _b000, MD, MB, MI, MS)))

#define POPLr(RD)           _m32only(_Or(0x58, _r4(RD)))
#define POPLm(MD, MB, MI, MS)       _m32only(_O_r_X(0x8f, _b000, MD, MB, MI, MS))

#define POPQr(RD)           _m64only((_REXQr(RD), _Or(0x58, _r8(RD))))
#define POPQm(MD, MB, MI, MS)       _m64only((_REXQm(MB, MI), _O_r_X(0x8f, _b000, MD, MB, MI, MS)))

#define PUSHWr(RS)          _m32only((_d16(), _Or(0x50, _r2(RS))))
#define PUSHWm(MD, MB, MI, MS)      _m32only((_d16(), _O_r_X(0xff, _b110, MD, MB, MI, MS)))
#define PUSHWi(IM)          _m32only((_d16(), _Os_sW(0x68, IM)))

#define PUSHLr(RS)          _m32only(_Or(0x50, _r4(RS)))
#define PUSHLm(MD, MB, MI, MS)      _m32only(_O_r_X(0xff, _b110, MD, MB, MI, MS))
#define PUSHLi(IM)          _m32only(_Os_sL(0x68, IM))

#define PUSHQr(RS)          _m64only((_REXQr(RS), _Or(0x50, _r8(RS))))
#define PUSHQm(MD, MB, MI, MS)      _m64only((_REXQm(MB, MI), _O_r_X(0xff, _b110, MD, MB, MI, MS)))
#define PUSHQi(IM)          _m64only(_Os_sL(0x68, IM))

#define POPA()              _m32only((_d16(), _O(0x61)))
#define POPAD()             _m32only(_O(0x61))

#define PUSHA()             _m32only((_d16(), _O(0x60)))
#define PUSHAD()            _m32only(_O(0x60))

#define POPF()                              _O(0x9d)
#define PUSHF()                             _O(0x9c)
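
/* Usage sketch: a minimal 64-bit prologue/epilogue pair, assuming the
 * usual X86_RBP/X86_RSP constants from the register enum above:
 *
 *	PUSHQr(X86_RBP);		// push %rbp
 *	MOVQrr(X86_RSP, X86_RBP);	// movq %rsp, %rbp
 *	...
 *	POPQr(X86_RBP);			// pop %rbp
 *	RET();				// ret
 */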


/* --- Test instructions --------------------------------------------------- */

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define TESTBrr(RS, RD)         (_REXBrr(RS, RD), _O_Mrm(0x84, _b11, _r1(RS), _r1(RD)))
#define TESTBrm(RS, MD, MB, MI, MS) (_REXBrm(RS, MB, MI), _O_r_X(0x84, _r1(RS), MD, MB, MI, MS))
#define TESTBir(IM, RD)         (X86_OPTIMIZE_ALU && ((RD) == X86_AL) ? \
                                 (_REXBrr(0, RD), _O_B(0xa8, _u8(IM))) : \
                                 (_REXBrr(0, RD), _O_Mrm_B(0xf6, _b11, _b000, _r1(RD), _u8(IM))))
#define TESTBim(IM, MD, MB, MI, MS) (_REXBrm(0, MB, MI), _O_r_X_B(0xf6, _b000, MD, MB, MI, MS, _u8(IM)))

#define TESTWrr(RS, RD)         (_d16(), _REXLrr(RS, RD), _O_Mrm(0x85, _b11, _r2(RS), _r2(RD)))
#define TESTWrm(RS, MD, MB, MI, MS) (_d16(), _REXLrm(RS, MB, MI), _O_r_X(0x85, _r2(RS), MD, MB, MI, MS))
#define TESTWir(IM, RD)         (X86_OPTIMIZE_ALU && ((RD) == X86_AX) ? \
                                 (_d16(), _REXLrr(0, RD), _O_W(0xa9, _u16(IM))) : \
                                 (_d16(), _REXLrr(0, RD), _O_Mrm_W(0xf7, _b11, _b000, _r2(RD), _u16(IM))))
#define TESTWim(IM, MD, MB, MI, MS) (_d16(), _REXLrm(0, MB, MI), _O_r_X_W(0xf7, _b000, MD, MB, MI, MS, _u16(IM)))

#define TESTLrr(RS, RD)         (_REXLrr(RS, RD), _O_Mrm(0x85, _b11, _r4(RS), _r4(RD)))
#define TESTLrm(RS, MD, MB, MI, MS) (_REXLrm(RS, MB, MI), _O_r_X(0x85, _r4(RS), MD, MB, MI, MS))
#define TESTLir(IM, RD)         (X86_OPTIMIZE_ALU && ((RD) == X86_EAX) ? \
                                 (_REXLrr(0, RD), _O_L(0xa9, IM)) : \
                                 (_REXLrr(0, RD), _O_Mrm_L(0xf7, _b11, _b000, _r4(RD), IM)))
#define TESTLim(IM, MD, MB, MI, MS) (_REXLrm(0, MB, MI), _O_r_X_L(0xf7, _b000, MD, MB, MI, MS, IM))

#define TESTQrr(RS, RD)         (_REXQrr(RS, RD), _O_Mrm(0x85, _b11, _r8(RS), _r8(RD)))
#define TESTQrm(RS, MD, MB, MI, MS) (_REXQrm(RS, MB, MI), _O_r_X(0x85, _r8(RS), MD, MB, MI, MS))
#define TESTQir(IM, RD)         (X86_OPTIMIZE_ALU && ((RD) == X86_RAX) ? \
                                 (_REXQrr(0, RD), _O_L(0xa9, IM)) : \
                                 (_REXQrr(0, RD), _O_Mrm_L(0xf7, _b11, _b000, _r8(RD), IM)))
#define TESTQim(IM, MD, MB, MI, MS) (_REXQrm(0, MB, MI), _O_r_X_L(0xf7, _b000, MD, MB, MI, MS, IM))
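
/* Usage sketch: TEST of a register against itself is the idiomatic
 * zero check, setting ZF/SF without consuming a spare register:
 *
 *	TESTLrr(X86_EAX, X86_EAX);	// testl %eax, %eax
 *	JZm(...);			// taken when %eax was zero
 */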


/* --- Exchange instructions ----------------------------------------------- */

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define CMPXCHGBrr(RS, RD)      (_REXBrr(RS, RD), _OO_Mrm(0x0fb0, _b11, _r1(RS), _r1(RD)))
#define CMPXCHGBrm(RS, MD, MB, MI, MS)  (_REXBrm(RS, MB, MI), _OO_r_X(0x0fb0, _r1(RS), MD, MB, MI, MS))

#define CMPXCHGWrr(RS, RD)      (_d16(), _REXLrr(RS, RD), _OO_Mrm(0x0fb1, _b11, _r2(RS), _r2(RD)))
#define CMPXCHGWrm(RS, MD, MB, MI, MS)  (_d16(), _REXLrm(RS, MB, MI), _OO_r_X(0x0fb1, _r2(RS), MD, MB, MI, MS))

#define CMPXCHGLrr(RS, RD)      (_REXLrr(RS, RD), _OO_Mrm(0x0fb1, _b11, _r4(RS), _r4(RD)))
#define CMPXCHGLrm(RS, MD, MB, MI, MS)  (_REXLrm(RS, MB, MI), _OO_r_X(0x0fb1, _r4(RS), MD, MB, MI, MS))

#define CMPXCHGQrr(RS, RD)      (_REXQrr(RS, RD), _OO_Mrm(0x0fb1, _b11, _r8(RS), _r8(RD)))
#define CMPXCHGQrm(RS, MD, MB, MI, MS)  (_REXQrm(RS, MB, MI), _OO_r_X(0x0fb1, _r8(RS), MD, MB, MI, MS))

#define XADDBrr(RS, RD)         (_REXBrr(RS, RD), _OO_Mrm(0x0fc0, _b11, _r1(RS), _r1(RD)))
#define XADDBrm(RS, MD, MB, MI, MS) (_REXBrm(RS, MB, MI), _OO_r_X(0x0fc0, _r1(RS), MD, MB, MI, MS))

#define XADDWrr(RS, RD)         (_d16(), _REXLrr(RS, RD), _OO_Mrm(0x0fc1, _b11, _r2(RS), _r2(RD)))
#define XADDWrm(RS, MD, MB, MI, MS) (_d16(), _REXLrm(RS, MB, MI), _OO_r_X(0x0fc1, _r2(RS), MD, MB, MI, MS))

#define XADDLrr(RS, RD)         (_REXLrr(RS, RD), _OO_Mrm(0x0fc1, _b11, _r4(RS), _r4(RD)))
#define XADDLrm(RS, MD, MB, MI, MS) (_REXLrm(RS, MB, MI), _OO_r_X(0x0fc1, _r4(RS), MD, MB, MI, MS))

#define XADDQrr(RS, RD)         (_REXQrr(RS, RD), _OO_Mrm(0x0fc1, _b11, _r8(RS), _r8(RD)))
#define XADDQrm(RS, MD, MB, MI, MS) (_REXQrm(RS, MB, MI), _OO_r_X(0x0fc1, _r8(RS), MD, MB, MI, MS))

#define XCHGBrr(RS, RD)         (_REXBrr(RS, RD), _O_Mrm(0x86, _b11, _r1(RS), _r1(RD)))
#define XCHGBrm(RS, MD, MB, MI, MS) (_REXBrm(RS, MB, MI), _O_r_X(0x86, _r1(RS), MD, MB, MI, MS))

#define XCHGWrr(RS, RD)         (_d16(), _REXLrr(RS, RD), _O_Mrm(0x87, _b11, _r2(RS), _r2(RD)))
#define XCHGWrm(RS, MD, MB, MI, MS) (_d16(), _REXLrm(RS, MB, MI), _O_r_X(0x87, _r2(RS), MD, MB, MI, MS))

#define XCHGLrr(RS, RD)         (_REXLrr(RS, RD), _O_Mrm(0x87, _b11, _r4(RS), _r4(RD)))
#define XCHGLrm(RS, MD, MB, MI, MS) (_REXLrm(RS, MB, MI), _O_r_X(0x87, _r4(RS), MD, MB, MI, MS))

#define XCHGQrr(RS, RD)         (_REXQrr(RS, RD), _O_Mrm(0x87, _b11, _r8(RS), _r8(RD)))
#define XCHGQrm(RS, MD, MB, MI, MS) (_REXQrm(RS, MB, MI), _O_r_X(0x87, _r8(RS), MD, MB, MI, MS))
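
/* Note (hardware convention): CMPXCHG compares the accumulator
   (AL/AX/EAX/RAX) with the destination and, on mismatch, reloads the
   accumulator with the destination's current value.  These macros emit
   no LOCK prefix, so the operation is not atomic as generated.  */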


/* --- Increment/Decrement instructions ------------------------------------ */

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define DECBm(MD, MB, MI, MS)       (_REXBrm(0, MB, MI), _O_r_X(0xfe, _b001, MD, MB, MI, MS))
#define DECBr(RD)           (_REXBrr(0, RD), _O_Mrm(0xfe, _b11, _b001, _r1(RD)))

#define DECWm(MD, MB, MI, MS)       (_d16(), _REXLrm(0, MB, MI), _O_r_X(0xff, _b001, MD, MB, MI, MS))
#define DECWr(RD)           (!X86_TARGET_64BIT ? (_d16(), _Or(0x48, _r2(RD))) : \
                             (_d16(), _REXLrr(0, RD), _O_Mrm(0xff, _b11, _b001, _r2(RD))))

#define DECLm(MD, MB, MI, MS)       (_REXLrm(0, MB, MI), _O_r_X(0xff, _b001, MD, MB, MI, MS))
#define DECLr(RD)           (!X86_TARGET_64BIT ?       _Or(0x48, _r4(RD)) : \
                             (_REXLrr(0, RD), _O_Mrm(0xff, _b11, _b001, _r4(RD))))

#define DECQm(MD, MB, MI, MS)       (_REXQrm(0, MB, MI), _O_r_X(0xff, _b001, MD, MB, MI, MS))
#define DECQr(RD)           (_REXQrr(0, RD), _O_Mrm(0xff, _b11, _b001, _r8(RD)))

#define INCBm(MD, MB, MI, MS)       (_REXBrm(0, MB, MI), _O_r_X(0xfe, _b000, MD, MB, MI, MS))
#define INCBr(RD)           (_REXBrr(0, RD), _O_Mrm(0xfe, _b11, _b000, _r1(RD)))

#define INCWm(MD, MB, MI, MS)       (_d16(), _REXLrm(0, MB, MI), _O_r_X(0xff, _b000, MD, MB, MI, MS))
#define INCWr(RD)           (!X86_TARGET_64BIT ? (_d16(), _Or(0x40, _r2(RD))) : \
                             (_d16(), _REXLrr(0, RD), _O_Mrm(0xff, _b11, _b000, _r2(RD))))

#define INCLm(MD, MB, MI, MS)       (_REXLrm(0, MB, MI), _O_r_X(0xff, _b000, MD, MB, MI, MS))
#define INCLr(RD)           (!X86_TARGET_64BIT ?       _Or(0x40, _r4(RD)) : \
                             (_REXLrr(0, RD), _O_Mrm(0xff, _b11, _b000, _r4(RD))))

#define INCQm(MD, MB, MI, MS)       (_REXQrm(0, MB, MI), _O_r_X(0xff, _b000, MD, MB, MI, MS))
#define INCQr(RD)           (_REXQrr(0, RD), _O_Mrm(0xff, _b11, _b000, _r8(RD)))
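
/* Note: the one-byte INC/DEC encodings (0x40..0x4f) are repurposed as
   REX prefixes on x86-64, which is why the W and L variants above fall
   back to the ModRM forms (0xff /0 and 0xff /1) when X86_TARGET_64BIT
   is set.  */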


/* --- Misc instructions --------------------------------------------------- */

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define BSFWrr(RS, RD)          (_d16(), _REXLrr(RD, RS), _OO_Mrm(0x0fbc, _b11, _r2(RD), _r2(RS)))
#define BSFWmr(MD, MB, MI, MS, RD)  (_d16(), _REXLmr(MB, MI, RD), _OO_r_X(0x0fbc, _r2(RD), MD, MB, MI, MS))
#define BSRWrr(RS, RD)          (_d16(), _REXLrr(RD, RS), _OO_Mrm(0x0fbd, _b11, _r2(RD), _r2(RS)))
#define BSRWmr(MD, MB, MI, MS, RD)  (_d16(), _REXLmr(MB, MI, RD), _OO_r_X(0x0fbd, _r2(RD), MD, MB, MI, MS))

#define BSFLrr(RS, RD)          (_REXLrr(RD, RS), _OO_Mrm(0x0fbc, _b11, _r4(RD), _r4(RS)))
#define BSFLmr(MD, MB, MI, MS, RD)  (_REXLmr(MB, MI, RD), _OO_r_X(0x0fbc, _r4(RD), MD, MB, MI, MS))
#define BSRLrr(RS, RD)          (_REXLrr(RD, RS), _OO_Mrm(0x0fbd, _b11, _r4(RD), _r4(RS)))
#define BSRLmr(MD, MB, MI, MS, RD)  (_REXLmr(MB, MI, RD), _OO_r_X(0x0fbd, _r4(RD), MD, MB, MI, MS))

#define BSFQrr(RS, RD)          (_REXQrr(RD, RS), _OO_Mrm(0x0fbc, _b11, _r8(RD), _r8(RS)))
#define BSFQmr(MD, MB, MI, MS, RD)  (_REXQmr(MB, MI, RD), _OO_r_X(0x0fbc, _r8(RD), MD, MB, MI, MS))
#define BSRQrr(RS, RD)          (_REXQrr(RD, RS), _OO_Mrm(0x0fbd, _b11, _r8(RD), _r8(RS)))
#define BSRQmr(MD, MB, MI, MS, RD)  (_REXQmr(MB, MI, RD), _OO_r_X(0x0fbd, _r8(RD), MD, MB, MI, MS))
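
/* Note (hardware convention): BSF/BSR leave the destination undefined
   and set ZF when the source is zero, so callers should test ZF or
   pre-check the operand.  */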

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define MOVSBWrr(RS, RD)        (_d16(), _REXBLrr(RD, RS), _OO_Mrm(0x0fbe, _b11, _r2(RD), _r1(RS)))
#define MOVSBWmr(MD, MB, MI, MS, RD)    (_d16(), _REXLmr(MB, MI, RD), _OO_r_X(0x0fbe, _r2(RD), MD, MB, MI, MS))
#define MOVZBWrr(RS, RD)        (_d16(), _REXBLrr(RD, RS), _OO_Mrm(0x0fb6, _b11, _r2(RD), _r1(RS)))
#define MOVZBWmr(MD, MB, MI, MS, RD)    (_d16(), _REXLmr(MB, MI, RD), _OO_r_X(0x0fb6, _r2(RD), MD, MB, MI, MS))

#define MOVSBLrr(RS, RD)        (_REXBLrr(RD, RS), _OO_Mrm(0x0fbe, _b11, _r4(RD), _r1(RS)))
#define MOVSBLmr(MD, MB, MI, MS, RD)    (_REXLmr(MB, MI, RD), _OO_r_X(0x0fbe, _r4(RD), MD, MB, MI, MS))
#define MOVZBLrr(RS, RD)        (_REXBLrr(RD, RS), _OO_Mrm(0x0fb6, _b11, _r4(RD), _r1(RS)))
#define MOVZBLmr(MD, MB, MI, MS, RD)    (_REXLmr(MB, MI, RD), _OO_r_X(0x0fb6, _r4(RD), MD, MB, MI, MS))

#define MOVSBQrr(RS, RD)        (_REXQrr(RD, RS), _OO_Mrm(0x0fbe, _b11, _r8(RD), _r1(RS)))
#define MOVSBQmr(MD, MB, MI, MS, RD)    (_REXQmr(MB, MI, RD), _OO_r_X(0x0fbe, _r8(RD), MD, MB, MI, MS))
#define MOVZBQrr(RS, RD)        (_REXQrr(RD, RS), _OO_Mrm(0x0fb6, _b11, _r8(RD), _r1(RS)))
#define MOVZBQmr(MD, MB, MI, MS, RD)    (_REXQmr(MB, MI, RD), _OO_r_X(0x0fb6, _r8(RD), MD, MB, MI, MS))

#define MOVSWLrr(RS, RD)        (_REXLrr(RD, RS), _OO_Mrm(0x0fbf, _b11, _r4(RD), _r2(RS)))
#define MOVSWLmr(MD, MB, MI, MS, RD)    (_REXLmr(MB, MI, RD), _OO_r_X(0x0fbf, _r4(RD), MD, MB, MI, MS))
#define MOVZWLrr(RS, RD)        (_REXLrr(RD, RS), _OO_Mrm(0x0fb7, _b11, _r4(RD), _r2(RS)))
#define MOVZWLmr(MD, MB, MI, MS, RD)    (_REXLmr(MB, MI, RD), _OO_r_X(0x0fb7, _r4(RD), MD, MB, MI, MS))

#define MOVSWQrr(RS, RD)        _m64only((_REXQrr(RD, RS), _OO_Mrm(0x0fbf, _b11, _r8(RD), _r2(RS))))
#define MOVSWQmr(MD, MB, MI, MS, RD)    _m64only((_REXQmr(MB, MI, RD), _OO_r_X(0x0fbf, _r8(RD), MD, MB, MI, MS)))
#define MOVZWQrr(RS, RD)        _m64only((_REXQrr(RD, RS), _OO_Mrm(0x0fb7, _b11, _r8(RD), _r2(RS))))
#define MOVZWQmr(MD, MB, MI, MS, RD)    _m64only((_REXQmr(MB, MI, RD), _OO_r_X(0x0fb7, _r8(RD), MD, MB, MI, MS)))

#define MOVSLQrr(RS, RD)        _m64only((_REXQrr(RD, RS), _O_Mrm(0x63, _b11, _r8(RD), _r4(RS))))
#define MOVSLQmr(MD, MB, MI, MS, RD)    _m64only((_REXQmr(MB, MI, RD), _O_r_X(0x63, _r8(RD), MD, MB, MI, MS)))

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define LEALmr(MD, MB, MI, MS, RD)  (_REXLmr(MB, MI, RD), _O_r_X(0x8d, _r4(RD), MD, MB, MI, MS))
#define LEAQmr(MD, MB, MI, MS, RD)  (_REXQmr(MB, MI, RD), _O_r_X(0x8d, _r8(RD), MD, MB, MI, MS))

#define BSWAPLr(R)          (_REXLrr(0, R), _OOr(0x0fc8, _r4(R)))
#define BSWAPQr(R)          (_REXQrr(0, R), _OOr(0x0fc8, _r8(R)))

#define CLC()                               _O(0xf8)
#define STC()                               _O(0xf9)
#define CMC()                               _O(0xf5)

#define CLD()                               _O(0xfc)
#define STD()                               _O(0xfd)

#define CBTW()              (_d16(), _O(0x98))
#define CWTL()                              _O(0x98)
#define CLTQ()              _m64only((_REXQrr(0, 0), _O(0x98)))

#define CBW CBTW
#define CWDE CWTL
#define CDQE CLTQ

#define CWTD()              (_d16(), _O(0x99))
#define CLTD()                              _O(0x99)
#define CQTO()              _m64only((_REXQrr(0, 0), _O(0x99)))

#define CWD CWTD
#define CDQ CLTD
#define CQO CQTO
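
/* Usage sketch: signed 32-bit division sign-extends EAX into EDX
 * first:
 *
 *	CLTD();			// cltd: sign-extend %eax into %edx
 *	IDIVLr(X86_ECX);	// idivl %ecx: quotient %eax, remainder %edx
 */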

#define LAHF()                              _O(0x9f)
#define SAHF()                              _O(0x9e)

/*									_format		Opcd		,Mod ,r	    ,m		,mem=dsp+sib	,imm... */

#define CPUID()                             _OO(0x0fa2)
#define RDTSC()                             _OO(0x0f31)

#define ENTERii(W, B)                           _O_W_B(0xc8, _su16(W), _su8(B))

#define LEAVE()                             _O(0xc9)
#define RET()                               _O(0xc3)
#define RETi(IM)                            _O_W(0xc2, _su16(IM))

#define NOP()                               _O(0x90)


/* --- Media 64-bit instructions ------------------------------------------- */

enum
{
    X86_MMX_PABSB = 0x1c,   // 2P
    X86_MMX_PABSW = 0x1d,   // 2P
    X86_MMX_PABSD = 0x1e,   // 2P
    X86_MMX_PACKSSWB = 0x63,
    X86_MMX_PACKSSDW = 0x6b,
    X86_MMX_PACKUSWB = 0x67,
    X86_MMX_PADDB = 0xfc,
    X86_MMX_PADDW = 0xfd,
    X86_MMX_PADDD = 0xfe,
    X86_MMX_PADDQ = 0xd4,
    X86_MMX_PADDSB = 0xec,
    X86_MMX_PADDSW = 0xed,
    X86_MMX_PADDUSB = 0xdc,
    X86_MMX_PADDUSW = 0xdd,
    X86_MMX_PAND = 0xdb,
    X86_MMX_PANDN = 0xdf,
    X86_MMX_PAVGB = 0xe0,
    X86_MMX_PAVGW = 0xe3,
    X86_MMX_PCMPEQB = 0x74,
    X86_MMX_PCMPEQW = 0x75,
    X86_MMX_PCMPEQD = 0x76,
    X86_MMX_PCMPGTB = 0x64,
    X86_MMX_PCMPGTW = 0x65,
    X86_MMX_PCMPGTD = 0x66,
    X86_MMX_PEXTRW = 0xc5,  // 64, /r ib
    X86_MMX_PHADDW = 0x01,  // 2P
    X86_MMX_PHADDD = 0x02,  // 2P
    X86_MMX_PHADDSW = 0x03, // 2P
    X86_MMX_PHSUBW = 0x05,  // 2P
    X86_MMX_PHSUBD = 0x06,  // 2P
    X86_MMX_PHSUBSW = 0x07, // 2P
    X86_MMX_PINSRW = 0xc4,  // 64, /r ib
    X86_MMX_PMADDUBSW = 0x04, // 2P
    X86_MMX_PMADDWD = 0xf5,
    X86_MMX_PMAXSW = 0xee,
    X86_MMX_PMAXUB = 0xde,
    X86_MMX_PMINSW = 0xea,
    X86_MMX_PMINUB = 0xda,
    X86_MMX_PMOVMSKB = 0xd7, // 64
    X86_MMX_PMULHRSW = 0x0b, // 2P
    X86_MMX_PMULHUW = 0xe4,
    X86_MMX_PMULHW = 0xe5,
    X86_MMX_PMULLW = 0xd5,
    X86_MMX_PMULUDQ = 0xf4,
    X86_MMX_POR = 0xeb,
    X86_MMX_PSADBW = 0xf6,
    X86_MMX_PSHUFB = 0x00,  // 2P
    X86_MMX_PSHUFW = 0x70,  // /r ib
    X86_MMX_PSIGNB = 0x08,  // 2P
    X86_MMX_PSIGNW = 0x09,  // 2P
    X86_MMX_PSIGND = 0x0a,  // 2P
    X86_MMX_PSLLW = 0xf1,
    X86_MMX_PSLLWi = 0x71,  // /6 ib
    X86_MMX_PSLLD = 0xf2,
    X86_MMX_PSLLDi = 0x72,  // /6 ib
    X86_MMX_PSLLQ = 0xf3,
    X86_MMX_PSLLQi = 0x73,  // /6 ib
    X86_MMX_PSRAW = 0xe1,
    X86_MMX_PSRAWi = 0x71,  // /4 ib
    X86_MMX_PSRAD = 0xe2,
    X86_MMX_PSRADi = 0x72,  // /4 ib
    X86_MMX_PSRLW = 0xd1,
    X86_MMX_PSRLWi = 0x71,  // /2 ib
    X86_MMX_PSRLD = 0xd2,
    X86_MMX_PSRLDi = 0x72,  // /2 ib
    X86_MMX_PSRLQ = 0xd3,
    X86_MMX_PSRLQi = 0x73,  // /2 ib
    X86_MMX_PSUBB = 0xf8,
    X86_MMX_PSUBW = 0xf9,
    X86_MMX_PSUBD = 0xfa,
    X86_MMX_PSUBQ = 0xfb,
    X86_MMX_PSUBSB = 0xe8,
    X86_MMX_PSUBSW = 0xe9,
    X86_MMX_PSUBUSB = 0xd8,
    X86_MMX_PSUBUSW = 0xd9,
    X86_MMX_PUNPCKHBW = 0x68,
    X86_MMX_PUNPCKHWD = 0x69,
    X86_MMX_PUNPCKHDQ = 0x6a,
    X86_MMX_PUNPCKLBW = 0x60,
    X86_MMX_PUNPCKLWD = 0x61,
    X86_MMX_PUNPCKLDQ = 0x62,
    X86_MMX_PXOR = 0xef,
};

#define __MMXLrr(OP, RS, RSA, RD, RDA)      (_REXLrr(RD, RS), _OO_Mrm(0x0f00 | (OP), _b11, RDA(RD), RSA(RS)))
#define __MMXLmr(OP, MD, MB, MI, MS, RD, RDA)     (_REXLmr(MB, MI, RD), _OO_r_X(0x0f00 | (OP), RDA(RD), MD, MB, MI, MS))
#define __MMXLrm(OP, RS, RSA, MD, MB, MI, MS)     (_REXLrm(RS, MB, MI), _OO_r_X(0x0f00 | (OP), RSA(RS), MD, MB, MI, MS))
#define __MMXLirr(OP, IM, RS, RSA, RD, RDA)      (_REXLrr(RD, RS), _OO_Mrm_B(0x0f00 | (OP), _b11, RDA(RD), RSA(RS), _u8(IM)))
#define __MMXLimr(OP, IM, MD, MB, MI, MS, RD, RDA) (_REXLmr(MB, MI, RD), _OO_r_X_B(0x0f00 | (OP), RDA(RD), MD, MB, MI, MS, _u8(IM)))
#define __MMXQrr(OP, RS, RSA, RD, RDA)      (_REXQrr(RD, RS), _OO_Mrm(0x0f00 | (OP), _b11, RDA(RD), RSA(RS)))
#define __MMXQmr(OP, MD, MB, MI, MS, RD, RDA)     (_REXQmr(MB, MI, RD), _OO_r_X(0x0f00 | (OP), RDA(RD), MD, MB, MI, MS))
#define __MMXQrm(OP, RS, RSA, MD, MB, MI, MS)     (_REXQrm(RS, MB, MI), _OO_r_X(0x0f00 | (OP), RSA(RS), MD, MB, MI, MS))
#define __MMXQirr(OP, IM, RS, RSA, RD, RDA)      (_REXQrr(RD, RS), _OO_Mrm_B(0x0f00 | (OP), _b11, RDA(RD), RSA(RS), _u8(IM)))
#define __MMXQimr(OP, IM, MD, MB, MI, MS, RD, RDA) (_REXQmr(MB, MI, RD), _OO_r_X_B(0x0f00 | (OP), RDA(RD), MD, MB, MI, MS, _u8(IM)))
#define __MMX1Lrr(PX, OP, RS, RSA, RD, RDA)      (_REXLrr(RD, RS), _B(0x0f), _OO_Mrm(((PX) << 8) | (OP), _b11, RDA(RD), RSA(RS)))
#define __MMX1Lmr(PX, OP, MD, MB, MI, MS, RD, RDA) (_REXLmr(MB, MI, RD), _B(0x0f), _OO_r_X(((PX) << 8) | (OP), RDA(RD), MD, MB, MI, MS))
#define __MMX1Lrm(PX, OP, RS, RSA, MD, MB, MI, MS) (_REXLrm(RS, MB, MI), _B(0x0f), _OO_r_X(((PX) << 8) | (OP), RSA(RS), MD, MB, MI, MS))

#define _MMXLrr(OP, RS, RD)       __MMXLrr(OP, RS, _rM, RD, _rM)
#define _MMXLmr(OP, MD, MB, MI, MS, RD)  __MMXLmr(OP, MD, MB, MI, MS, RD, _rM)
#define _MMXLrm(OP, RS, MD, MB, MI, MS)  __MMXLrm(OP, RS, _rM, MD, MB, MI, MS)
#define _MMXQrr(OP, RS, RD)       __MMXQrr(OP, RS, _rM, RD, _rM)
#define _MMXQmr(OP, MD, MB, MI, MS, RD)  __MMXQmr(OP, MD, MB, MI, MS, RD, _rM)
#define _MMXQrm(OP, RS, MD, MB, MI, MS)  __MMXQrm(OP, RS, _rM, MD, MB, MI, MS)
#define _2P_MMXLrr(OP, RS, RD)        __MMX1Lrr(0x38, OP, RS, _rM, RD, _rM)
#define _2P_MMXLmr(OP, MD, MB, MI, MS, RD)   __MMX1Lmr(0x38, OP, MD, MB, MI, MS, RD, _rM)
#define _2P_MMXLrm(OP, RS, MD, MB, MI, MS)   __MMX1Lrm(0x38, OP, RS, _rM, MD, MB, MI, MS)

#define MMX_MOVDMDrr(RS, RD)        __MMXLrr(0x6e, RS, _r4, RD, _rM)
#define MMX_MOVQMDrr(RS, RD)        __MMXQrr(0x6e, RS, _r8, RD, _rM)
#define MMX_MOVDMSrr(RS, RD)        __MMXLrr(0x7e, RD, _r4, RS, _rM)
#define MMX_MOVQMSrr(RS, RD)        __MMXQrr(0x7e, RD, _r8, RS, _rM)

#define MMX_MOVDmr(MD, MB, MI, MS, RD)  _MMXLmr(0x6e, MD, MB, MI, MS, RD)
#define MMX_MOVDrm(RS, MD, MB, MI, MS)  _MMXLrm(0x7e, RS, MD, MB, MI, MS)
#define MMX_MOVQrr(RS, RD)      _MMXLrr(0x6f, RS, RD)
#define MMX_MOVQmr(MD, MB, MI, MS, RD)  _MMXLmr(0x6f, MD, MB, MI, MS, RD)
#define MMX_MOVQrm(RS, MD, MB, MI, MS)  _MMXLrm(0x7f, RS, MD, MB, MI, MS)
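
/* Usage sketch: moving a 32-bit GPR into an MMX register and back,
 * assuming an X86_MM0 constant from the MMX register enum above:
 *
 *	MMX_MOVDMDrr(X86_EAX, X86_MM0);	// movd %eax, %mm0
 *	MMX_MOVDMSrr(X86_MM0, X86_EAX);	// movd %mm0, %eax
 */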

// Original MMX instructions
#define MMX_PACKSSWBrr(RS, RD)      _MMXLrr(X86_MMX_PACKSSWB, RS, RD)
#define MMX_PACKSSWBmr(MD, MB, MI, MS, RD)  _MMXLmr(X86_MMX_PACKSSWB, MD, MB, MI, MS, RD)
#define MMX_PACKSSDWrr(RS, RD)      _MMXLrr(X86_MMX_PACKSSDW, RS, RD)
#define MMX_PACKSSDWmr(MD, MB, MI, MS, RD)  _MMXLmr(X86_MMX_PACKSSDW, MD, MB, MI, MS, RD)
#define MMX_PACKUSWBrr(RS, RD)      _MMXLrr(X86_MMX_PACKUSWB, RS, RD)
#define MMX_PACKUSWBmr(MD, MB, MI, MS, RD)  _MMXLmr(X86_MMX_PACKUSWB, MD, MB, MI, MS, RD)
#define MMX_PADDBrr(RS, RD)     _MMXLrr(X86_MMX_PADDB, RS, RD)
#define MMX_PADDBmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PADDB, MD, MB, MI, MS, RD)
#define MMX_PADDWrr(RS, RD)     _MMXLrr(X86_MMX_PADDW, RS, RD)
#define MMX_PADDWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PADDW, MD, MB, MI, MS, RD)
#define MMX_PADDDrr(RS, RD)     _MMXLrr(X86_MMX_PADDD, RS, RD)
#define MMX_PADDDmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PADDD, MD, MB, MI, MS, RD)
#define MMX_PADDQrr(RS, RD)     _MMXLrr(X86_MMX_PADDQ, RS, RD)
#define MMX_PADDQmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PADDQ, MD, MB, MI, MS, RD)
#define MMX_PADDSBrr(RS, RD)        _MMXLrr(X86_MMX_PADDSB, RS, RD)
#define MMX_PADDSBmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PADDSB, MD, MB, MI, MS, RD)
#define MMX_PADDSWrr(RS, RD)        _MMXLrr(X86_MMX_PADDSW, RS, RD)
#define MMX_PADDSWmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PADDSW, MD, MB, MI, MS, RD)
#define MMX_PADDUSBrr(RS, RD)       _MMXLrr(X86_MMX_PADDUSB, RS, RD)
#define MMX_PADDUSBmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PADDUSB, MD, MB, MI, MS, RD)
#define MMX_PADDUSWrr(RS, RD)       _MMXLrr(X86_MMX_PADDUSW, RS, RD)
#define MMX_PADDUSWmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PADDUSW, MD, MB, MI, MS, RD)
#define MMX_PANDrr(RS, RD)      _MMXLrr(X86_MMX_PAND, RS, RD)
#define MMX_PANDmr(MD, MB, MI, MS, RD)  _MMXLmr(X86_MMX_PAND, MD, MB, MI, MS, RD)
#define MMX_PANDNrr(RS, RD)     _MMXLrr(X86_MMX_PANDN, RS, RD)
#define MMX_PANDNmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PANDN, MD, MB, MI, MS, RD)
#define MMX_PAVGBrr(RS, RD)     _MMXLrr(X86_MMX_PAVGB, RS, RD)
#define MMX_PAVGBmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PAVGB, MD, MB, MI, MS, RD)
#define MMX_PAVGWrr(RS, RD)     _MMXLrr(X86_MMX_PAVGW, RS, RD)
#define MMX_PAVGWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PAVGW, MD, MB, MI, MS, RD)
#define MMX_PCMPEQBrr(RS, RD)       _MMXLrr(X86_MMX_PCMPEQB, RS, RD)
#define MMX_PCMPEQBmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PCMPEQB, MD, MB, MI, MS, RD)
#define MMX_PCMPEQWrr(RS, RD)       _MMXLrr(X86_MMX_PCMPEQW, RS, RD)
#define MMX_PCMPEQWmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PCMPEQW, MD, MB, MI, MS, RD)
#define MMX_PCMPEQDrr(RS, RD)       _MMXLrr(X86_MMX_PCMPEQD, RS, RD)
#define MMX_PCMPEQDmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PCMPEQD, MD, MB, MI, MS, RD)
#define MMX_PCMPGTBrr(RS, RD)       _MMXLrr(X86_MMX_PCMPGTB, RS, RD)
#define MMX_PCMPGTBmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PCMPGTB, MD, MB, MI, MS, RD)
#define MMX_PCMPGTWrr(RS, RD)       _MMXLrr(X86_MMX_PCMPGTW, RS, RD)
#define MMX_PCMPGTWmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PCMPGTW, MD, MB, MI, MS, RD)
#define MMX_PCMPGTDrr(RS, RD)       _MMXLrr(X86_MMX_PCMPGTD, RS, RD)
#define MMX_PCMPGTDmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PCMPGTD, MD, MB, MI, MS, RD)
#define MMX_PMADDWDrr(RS, RD)       _MMXLrr(X86_MMX_PMADDWD, RS, RD)
#define MMX_PMADDWDmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PMADDWD, MD, MB, MI, MS, RD)
#define MMX_PMAXSWrr(RS, RD)        _MMXLrr(X86_MMX_PMAXSW, RS, RD)
#define MMX_PMAXSWmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PMAXSW, MD, MB, MI, MS, RD)
#define MMX_PMAXUBrr(RS, RD)        _MMXLrr(X86_MMX_PMAXUB, RS, RD)
#define MMX_PMAXUBmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PMAXUB, MD, MB, MI, MS, RD)
#define MMX_PMINSWrr(RS, RD)        _MMXLrr(X86_MMX_PMINSW, RS, RD)
#define MMX_PMINSWmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PMINSW, MD, MB, MI, MS, RD)
#define MMX_PMINUBrr(RS, RD)        _MMXLrr(X86_MMX_PMINUB, RS, RD)
#define MMX_PMINUBmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PMINUB, MD, MB, MI, MS, RD)
#define MMX_PMULHUWrr(RS, RD)       _MMXLrr(X86_MMX_PMULHUW, RS, RD)
#define MMX_PMULHUWmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PMULHUW, MD, MB, MI, MS, RD)
#define MMX_PMULHWrr(RS, RD)        _MMXLrr(X86_MMX_PMULHW, RS, RD)
#define MMX_PMULHWmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PMULHW, MD, MB, MI, MS, RD)
#define MMX_PMULLWrr(RS, RD)        _MMXLrr(X86_MMX_PMULLW, RS, RD)
#define MMX_PMULLWmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PMULLW, MD, MB, MI, MS, RD)
#define MMX_PMULUDQrr(RS, RD)       _MMXLrr(X86_MMX_PMULUDQ, RS, RD)
#define MMX_PMULUDQmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PMULUDQ, MD, MB, MI, MS, RD)
#define MMX_PORrr(RS, RD)       _MMXLrr(X86_MMX_POR, RS, RD)
#define MMX_PORmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_POR, MD, MB, MI, MS, RD)
#define MMX_PSADBWrr(RS, RD)        _MMXLrr(X86_MMX_PSADBW, RS, RD)
#define MMX_PSADBWmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PSADBW, MD, MB, MI, MS, RD)
#define MMX_PSLLWir(IM, RD)     __MMXLirr(X86_MMX_PSLLWi, IM, RD, _rM, _b110, _rN)
#define MMX_PSLLWrr(RS, RD)     _MMXLrr(X86_MMX_PSLLW, RS, RD)
#define MMX_PSLLWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSLLW, MD, MB, MI, MS, RD)
#define MMX_PSLLDir(IM, RD)     __MMXLirr(X86_MMX_PSLLDi, IM, RD, _rM, _b110, _rN)
#define MMX_PSLLDrr(RS, RD)     _MMXLrr(X86_MMX_PSLLD, RS, RD)
#define MMX_PSLLDmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSLLD, MD, MB, MI, MS, RD)
#define MMX_PSLLQir(IM, RD)     __MMXLirr(X86_MMX_PSLLQi, IM, RD, _rM, _b110, _rN)
#define MMX_PSLLQrr(RS, RD)     _MMXLrr(X86_MMX_PSLLQ, RS, RD)
#define MMX_PSLLQmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSLLQ, MD, MB, MI, MS, RD)
#define MMX_PSRAWir(IM, RD)     __MMXLirr(X86_MMX_PSRAWi, IM, RD, _rM, _b100, _rN)
#define MMX_PSRAWrr(RS, RD)     _MMXLrr(X86_MMX_PSRAW, RS, RD)
#define MMX_PSRAWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSRAW, MD, MB, MI, MS, RD)
#define MMX_PSRADir(IM, RD)     __MMXLirr(X86_MMX_PSRADi, IM, RD, _rM, _b100, _rN)
#define MMX_PSRADrr(RS, RD)     _MMXLrr(X86_MMX_PSRAD, RS, RD)
#define MMX_PSRADmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSRAD, MD, MB, MI, MS, RD)
#define MMX_PSRLWir(IM, RD)     __MMXLirr(X86_MMX_PSRLWi, IM, RD, _rM, _b010, _rN)
#define MMX_PSRLWrr(RS, RD)     _MMXLrr(X86_MMX_PSRLW, RS, RD)
#define MMX_PSRLWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSRLW, MD, MB, MI, MS, RD)
#define MMX_PSRLDir(IM, RD)     __MMXLirr(X86_MMX_PSRLDi, IM, RD, _rM, _b010, _rN)
#define MMX_PSRLDrr(RS, RD)     _MMXLrr(X86_MMX_PSRLD, RS, RD)
#define MMX_PSRLDmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSRLD, MD, MB, MI, MS, RD)
#define MMX_PSRLQir(IM, RD)     __MMXLirr(X86_MMX_PSRLQi, IM, RD, _rM, _b010, _rN)
#define MMX_PSRLQrr(RS, RD)     _MMXLrr(X86_MMX_PSRLQ, RS, RD)
#define MMX_PSRLQmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSRLQ, MD, MB, MI, MS, RD)
#define MMX_PSUBBrr(RS, RD)     _MMXLrr(X86_MMX_PSUBB, RS, RD)
#define MMX_PSUBBmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSUBB, MD, MB, MI, MS, RD)
#define MMX_PSUBWrr(RS, RD)     _MMXLrr(X86_MMX_PSUBW, RS, RD)
#define MMX_PSUBWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSUBW, MD, MB, MI, MS, RD)
#define MMX_PSUBDrr(RS, RD)     _MMXLrr(X86_MMX_PSUBD, RS, RD)
#define MMX_PSUBDmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSUBD, MD, MB, MI, MS, RD)
#define MMX_PSUBQrr(RS, RD)     _MMXLrr(X86_MMX_PSUBQ, RS, RD)
#define MMX_PSUBQmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PSUBQ, MD, MB, MI, MS, RD)
#define MMX_PSUBSBrr(RS, RD)        _MMXLrr(X86_MMX_PSUBSB, RS, RD)
#define MMX_PSUBSBmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PSUBSB, MD, MB, MI, MS, RD)
#define MMX_PSUBSWrr(RS, RD)        _MMXLrr(X86_MMX_PSUBSW, RS, RD)
#define MMX_PSUBSWmr(MD, MB, MI, MS, RD)    _MMXLmr(X86_MMX_PSUBSW, MD, MB, MI, MS, RD)
#define MMX_PSUBUSBrr(RS, RD)       _MMXLrr(X86_MMX_PSUBUSB, RS, RD)
#define MMX_PSUBUSBmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PSUBUSB, MD, MB, MI, MS, RD)
#define MMX_PSUBUSWrr(RS, RD)       _MMXLrr(X86_MMX_PSUBUSW, RS, RD)
#define MMX_PSUBUSWmr(MD, MB, MI, MS, RD)   _MMXLmr(X86_MMX_PSUBUSW, MD, MB, MI, MS, RD)
#define MMX_PUNPCKHBWrr(RS, RD)     _MMXLrr(X86_MMX_PUNPCKHBW, RS, RD)
#define MMX_PUNPCKHBWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PUNPCKHBW, MD, MB, MI, MS, RD)
#define MMX_PUNPCKHWDrr(RS, RD)     _MMXLrr(X86_MMX_PUNPCKHWD, RS, RD)
#define MMX_PUNPCKHWDmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PUNPCKHWD, MD, MB, MI, MS, RD)
#define MMX_PUNPCKHDQrr(RS, RD)     _MMXLrr(X86_MMX_PUNPCKHDQ, RS, RD)
#define MMX_PUNPCKHDQmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PUNPCKHDQ, MD, MB, MI, MS, RD)
#define MMX_PUNPCKLBWrr(RS, RD)     _MMXLrr(X86_MMX_PUNPCKLBW, RS, RD)
#define MMX_PUNPCKLBWmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PUNPCKLBW, MD, MB, MI, MS, RD)
#define MMX_PUNPCKLWDrr(RS, RD)     _MMXLrr(X86_MMX_PUNPCKLWD, RS, RD)
#define MMX_PUNPCKLWDmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PUNPCKLWD, MD, MB, MI, MS, RD)
#define MMX_PUNPCKLDQrr(RS, RD)     _MMXLrr(X86_MMX_PUNPCKLDQ, RS, RD)
#define MMX_PUNPCKLDQmr(MD, MB, MI, MS, RD) _MMXLmr(X86_MMX_PUNPCKLDQ, MD, MB, MI, MS, RD)
#define MMX_PXORrr(RS, RD)      _MMXLrr(X86_MMX_PXOR, RS, RD)
#define MMX_PXORmr(MD, MB, MI, MS, RD)  _MMXLmr(X86_MMX_PXOR, MD, MB, MI, MS, RD)

#define MMX_PSHUFWirr(IM, RS, RD)       __MMXLirr(X86_MMX_PSHUFW, IM, RS, _rM, RD, _rM)
#define MMX_PSHUFWimr(IM, MD, MB, MI, MS, RD)   __MMXLimr(X86_MMX_PSHUFW, IM, MD, MB, MI, MS, RD, _rM)
#define MMX_PEXTRWLirr(IM, RS, RD)      __MMXLirr(X86_MMX_PEXTRW, IM, RS, _rM, RD, _r4)
#define MMX_PEXTRWQirr(IM, RS, RD)      __MMXQirr(X86_MMX_PEXTRW, IM, RS, _rM, RD, _r8)
#define MMX_PINSRWLirr(IM, RS, RD)      __MMXLirr(X86_MMX_PINSRW, IM, RS, _r4, RD, _rM)
#define MMX_PINSRWLimr(IM, MD, MB, MI, MS, RD)  __MMXLimr(X86_MMX_PINSRW, IM, MD, MB, MI, MS, RD, _rM)
#define MMX_PINSRWQirr(IM, RS, RD)      __MMXQirr(X86_MMX_PINSRW, IM, RS, _r4, RD, _rM)
#define MMX_PINSRWQimr(IM, MD, MB, MI, MS, RD)  __MMXQimr(X86_MMX_PINSRW, IM, MD, MB, MI, MS, RD, _rM)

// Additional MMX instructions, introduced by the SSSE3 ISA
#define MMX_PABSBrr(RS, RD)     _2P_MMXLrr(X86_MMX_PABSB, RS, RD)
#define MMX_PABSBmr(MD, MB, MI, MS, RD) _2P_MMXLmr(X86_MMX_PABSB, MD, MB, MI, MS, RD)
#define MMX_PABSWrr(RS, RD)     _2P_MMXLrr(X86_MMX_PABSW, RS, RD)
#define MMX_PABSWmr(MD, MB, MI, MS, RD) _2P_MMXLmr(X86_MMX_PABSW, MD, MB, MI, MS, RD)
#define MMX_PABSDrr(RS, RD)     _2P_MMXLrr(X86_MMX_PABSD, RS, RD)
#define MMX_PABSDmr(MD, MB, MI, MS, RD) _2P_MMXLmr(X86_MMX_PABSD, MD, MB, MI, MS, RD)
#define MMX_PHADDWrr(RS, RD)        _2P_MMXLrr(X86_MMX_PHADDW, RS, RD)
#define MMX_PHADDWmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PHADDW, MD, MB, MI, MS, RD)
#define MMX_PHADDDrr(RS, RD)        _2P_MMXLrr(X86_MMX_PHADDD, RS, RD)
#define MMX_PHADDDmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PHADDD, MD, MB, MI, MS, RD)
#define MMX_PHADDSWrr(RS, RD)       _2P_MMXLrr(X86_MMX_PHADDSW, RS, RD)
#define MMX_PHADDSWmr(MD, MB, MI, MS, RD)   _2P_MMXLmr(X86_MMX_PHADDSW, MD, MB, MI, MS, RD)
#define MMX_PHSUBWrr(RS, RD)        _2P_MMXLrr(X86_MMX_PHSUBW, RS, RD)
#define MMX_PHSUBWmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PHSUBW, MD, MB, MI, MS, RD)
#define MMX_PHSUBDrr(RS, RD)        _2P_MMXLrr(X86_MMX_PHSUBD, RS, RD)
#define MMX_PHSUBDmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PHSUBD, MD, MB, MI, MS, RD)
#define MMX_PHSUBSWrr(RS, RD)       _2P_MMXLrr(X86_MMX_PHSUBSW, RS, RD)
#define MMX_PHSUBSWmr(MD, MB, MI, MS, RD)   _2P_MMXLmr(X86_MMX_PHSUBSW, MD, MB, MI, MS, RD)
#define MMX_PMADDUBSWrr(RS, RD)     _2P_MMXLrr(X86_MMX_PMADDUBSW, RS, RD)
#define MMX_PMADDUBSWmr(MD, MB, MI, MS, RD) _2P_MMXLmr(X86_MMX_PMADDUBSW, MD, MB, MI, MS, RD)
#define MMX_PMULHRSWrr(RS, RD)      _2P_MMXLrr(X86_MMX_PMULHRSW, RS, RD)
#define MMX_PMULHRSWmr(MD, MB, MI, MS, RD)  _2P_MMXLmr(X86_MMX_PMULHRSW, MD, MB, MI, MS, RD)
#define MMX_PSHUFBrr(RS, RD)        _2P_MMXLrr(X86_MMX_PSHUFB, RS, RD)
#define MMX_PSHUFBmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PSHUFB, MD, MB, MI, MS, RD)
#define MMX_PSIGNBrr(RS, RD)        _2P_MMXLrr(X86_MMX_PSIGNB, RS, RD)
#define MMX_PSIGNBmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PSIGNB, MD, MB, MI, MS, RD)
#define MMX_PSIGNWrr(RS, RD)        _2P_MMXLrr(X86_MMX_PSIGNW, RS, RD)
#define MMX_PSIGNWmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PSIGNW, MD, MB, MI, MS, RD)
#define MMX_PSIGNDrr(RS, RD)        _2P_MMXLrr(X86_MMX_PSIGND, RS, RD)
#define MMX_PSIGNDmr(MD, MB, MI, MS, RD)    _2P_MMXLmr(X86_MMX_PSIGND, MD, MB, MI, MS, RD)

#define EMMS()                              _OO(0x0f77)
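
/* Note (hardware convention): the MMX registers alias the x87 stack,
   so EMMS() should be emitted after an MMX block and before any
   subsequent x87 code to reset the x87 tag word.  */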


/* --- Media 128-bit instructions ------------------------------------------ */

enum
{
    X86_SSE_CC_EQ = 0,
    X86_SSE_CC_LT = 1,
    X86_SSE_CC_GT = 1,
    X86_SSE_CC_LE = 2,
    X86_SSE_CC_GE = 2,
    X86_SSE_CC_U = 3,
    X86_SSE_CC_NEQ = 4,
    X86_SSE_CC_NLT = 5,
    X86_SSE_CC_NGT = 5,
    X86_SSE_CC_NLE = 6,
    X86_SSE_CC_NGE = 6,
    X86_SSE_CC_O = 7
};
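
/* GT/GE/NGT/NGE alias the LT/LE/NLT/NLE predicates above: the hardware
   only encodes the "less" forms, so greater-than compares are obtained
   by swapping the operands.  */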

enum
{
    X86_SSE_UCOMI = 0x2e,
    X86_SSE_COMI = 0x2f,
    X86_SSE_CMP = 0xc2,
    X86_SSE_SQRT = 0x51,
    X86_SSE_RSQRT = 0x52,
    X86_SSE_RCP = 0x53,
    X86_SSE_AND = 0x54,
    X86_SSE_ANDN = 0x55,
    X86_SSE_OR = 0x56,
    X86_SSE_XOR = 0x57,
    X86_SSE_ADD = 0x58,
    X86_SSE_MUL = 0x59,
    X86_SSE_SUB = 0x5c,
    X86_SSE_MIN = 0x5d,
    X86_SSE_DIV = 0x5e,
    X86_SSE_MAX = 0x5f,
    X86_SSE_CVTDQ2PD = 0xe6,
    X86_SSE_CVTDQ2PS = 0x5b,
    X86_SSE_CVTPD2DQ = 0xe6,
    X86_SSE_CVTPD2PI = 0x2d,
    X86_SSE_CVTPD2PS = 0x5a,
    X86_SSE_CVTPI2PD = 0x2a,
    X86_SSE_CVTPI2PS = 0x2a,
    X86_SSE_CVTPS2DQ = 0x5b,
    X86_SSE_CVTPS2PD = 0x5a,
    X86_SSE_CVTPS2PI = 0x2d,
    X86_SSE_CVTSD2SI = 0x2d,
    X86_SSE_CVTSD2SS = 0x5a,
    X86_SSE_CVTSI2SD = 0x2a,
    X86_SSE_CVTSI2SS = 0x2a,
    X86_SSE_CVTSS2SD = 0x5a,
    X86_SSE_CVTSS2SI = 0x2d,
    X86_SSE_CVTTPD2PI = 0x2c,
    X86_SSE_CVTTPD2DQ = 0xe6,
    X86_SSE_CVTTPS2DQ = 0x5b,
    X86_SSE_CVTTPS2PI = 0x2c,
    X86_SSE_CVTTSD2SI = 0x2c,
    X86_SSE_CVTTSS2SI = 0x2c,
    X86_SSE_MOVMSK = 0x50,
    X86_SSE_PACKSSDW = 0x6b,
    X86_SSE_PACKSSWB = 0x63,
    X86_SSE_PACKUSWB = 0x67,
    X86_SSE_PADDB = 0xfc,
    X86_SSE_PADDD = 0xfe,
    X86_SSE_PADDQ = 0xd4,
    X86_SSE_PADDSB = 0xec,
    X86_SSE_PADDSW = 0xed,
    X86_SSE_PADDUSB = 0xdc,
    X86_SSE_PADDUSW = 0xdd,
    X86_SSE_PADDW = 0xfd,
    X86_SSE_PAND = 0xdb,
    X86_SSE_PANDN = 0xdf,
    X86_SSE_PAVGB = 0xe0,
    X86_SSE_PAVGW = 0xe3,
    X86_SSE_PCMPEQB = 0x74,
    X86_SSE_PCMPEQD = 0x76,
    X86_SSE_PCMPEQW = 0x75,
    X86_SSE_PCMPGTB = 0x64,
    X86_SSE_PCMPGTD = 0x66,
    X86_SSE_PCMPGTW = 0x65,
    X86_SSE_PMADDWD = 0xf5,
    X86_SSE_PMAXSW = 0xee,
    X86_SSE_PMAXUB = 0xde,
    X86_SSE_PMINSW = 0xea,
    X86_SSE_PMINUB = 0xda,
    X86_SSE_PMOVMSKB = 0xd7,
    X86_SSE_PMULHUW = 0xe4,
    X86_SSE_PMULHW = 0xe5,
    X86_SSE_PMULLW = 0xd5,
    X86_SSE_PMULUDQ = 0xf4,
    X86_SSE_POR = 0xeb,
    X86_SSE_PSADBW = 0xf6,
    X86_SSE_PSLLD = 0xf2,
    X86_SSE_PSLLQ = 0xf3,
    X86_SSE_PSLLW = 0xf1,
    X86_SSE_PSRAD = 0xe2,
    X86_SSE_PSRAW = 0xe1,
    X86_SSE_PSRLD = 0xd2,
    X86_SSE_PSRLQ = 0xd3,
    X86_SSE_PSRLW = 0xd1,
    X86_SSE_PSUBB = 0xf8,
    X86_SSE_PSUBD = 0xfa,
    X86_SSE_PSUBQ = 0xfb,
    X86_SSE_PSUBSB = 0xe8,
    X86_SSE_PSUBSW = 0xe9,
    X86_SSE_PSUBUSB = 0xd8,
    X86_SSE_PSUBUSW = 0xd9,
    X86_SSE_PSUBW = 0xf9,
    X86_SSE_PUNPCKHBW = 0x68,
    X86_SSE_PUNPCKHDQ = 0x6a,
    X86_SSE_PUNPCKHQDQ = 0x6d,
    X86_SSE_PUNPCKHWD = 0x69,
    X86_SSE_PUNPCKLBW = 0x60,
    X86_SSE_PUNPCKLDQ = 0x62,
    X86_SSE_PUNPCKLQDQ = 0x6c,
    X86_SSE_PUNPCKLWD = 0x61,
    X86_SSE_PXOR = 0xef,
    X86_SSSE3_PSHUFB = 0x00,
};

/*									_format		Opcd		,Mod ,r	     ,m		,mem=dsp+sib	,imm... */

#define _SSSE3Lrr(OP1, OP2, RS, RSA, RD, RDA)    (_B(0x66), _REXLrr(RD, RS), _B(0x0f), _OO_Mrm(((OP1) << 8) | (OP2), _b11, RDA(RD), RSA(RS)))
#define _SSSE3Lmr(OP1, OP2, MD, MB, MI, MS, RD, RDA)   (_B(0x66), _REXLmr(MB, MI, RD), _B(0x0f), _OO_r_X(((OP1) << 8) | (OP2), RDA(RD), MD, MB, MI, MS))
#define _SSSE3Lirr(OP1, OP2, IM, RS, RD)        (_B(0x66), _REXLrr(RD, RS), _B(0x0f), _OO_Mrm_B(((OP1) << 8) | (OP2), _b11, _rX(RD), _rX(RS), _u8(IM)))
#define _SSSE3Limr(OP1, OP2, IM, MD, MB, MI, MS, RD)   (_B(0x66), _REXLmr(MB, MI, RD), _B(0x0f), _OO_r_X_B(((OP1) << 8) | (OP2), _rX(RD), MD, MB, MI, MS, _u8(IM)))

#define __SSELir(OP, MO, IM, RD)       (_REXLrr(0, RD), _OO_Mrm_B(0x0f00 | (OP), _b11, MO, _rX(RD), _u8(IM)))
#define __SSELim(OP, MO, IM, MD, MB, MI, MS)  (_REXLrm(0, MB, MI), _OO_r_X_B(0x0f00 | (OP), MO, MD, MB, MI, MS, _u8(IM)))
#define __SSELrr(OP, RS, RSA, RD, RDA)  (_REXLrr(RD, RS), _OO_Mrm(0x0f00 | (OP), _b11, RDA(RD), RSA(RS)))
#define __SSELmr(OP, MD, MB, MI, MS, RD, RDA) (_REXLmr(MB, MI, RD), _OO_r_X(0x0f00 | (OP), RDA(RD), MD, MB, MI, MS))
#define __SSELrm(OP, RS, RSA, MD, MB, MI, MS) (_REXLrm(RS, MB, MI), _OO_r_X(0x0f00 | (OP), RSA(RS), MD, MB, MI, MS))
#define __SSELirr(OP, IM, RS, RD)      (_REXLrr(RD, RS), _OO_Mrm_B(0x0f00 | (OP), _b11, _rX(RD), _rX(RS), _u8(IM)))
#define __SSELimr(OP, IM, MD, MB, MI, MS, RD) (_REXLmr(MB, MI, RD), _OO_r_X_B(0x0f00 | (OP), _rX(RD), MD, MB, MI, MS, _u8(IM)))

#define __SSEQrr(OP, RS, RSA, RD, RDA)  (_REXQrr(RD, RS), _OO_Mrm(0x0f00 | (OP), _b11, RDA(RD), RSA(RS)))
#define __SSEQmr(OP, MD, MB, MI, MS, RD, RDA) (_REXQmr(MB, MI, RD), _OO_r_X(0x0f00 | (OP), RDA(RD), MD, MB, MI, MS))
#define __SSEQrm(OP, RS, RSA, MD, MB, MI, MS) (_REXQrm(RS, MB, MI), _OO_r_X(0x0f00 | (OP), RSA(RS), MD, MB, MI, MS))

#define _SSELrr(PX, OP, RS, RSA, RD, RDA)                    (_B(PX), __SSELrr(OP, RS, RSA, RD, RDA))
#define _SSELmr(PX, OP, MD, MB, MI, MS, RD, RDA)               (_B(PX), __SSELmr(OP, MD, MB, MI, MS, RD, RDA))
#define _SSELrm(PX, OP, RS, RSA, MD, MB, MI, MS)               (_B(PX), __SSELrm(OP, RS, RSA, MD, MB, MI, MS))
#define _SSELir(PX, OP, MO, IM, RD)                     (_B(PX), __SSELir(OP, MO, IM, RD))
#define _SSELim(PX, OP, MO, IM, MD, MB, MI, MS)                (_B(PX), __SSELim(OP, MO, IM, MD, MB, MI, MS))
#define _SSELirr(PX, OP, IM, RS, RD)                    (_B(PX), __SSELirr(OP, IM, RS, RD))
#define _SSELimr(PX, OP, IM, MD, MB, MI, MS, RD)               (_B(PX), __SSELimr(OP, IM, MD, MB, MI, MS, RD))

#define _SSEQrr(PX, OP, RS, RSA, RD, RDA)                    (_B(PX), __SSEQrr(OP, RS, RSA, RD, RDA))
#define _SSEQmr(PX, OP, MD, MB, MI, MS, RD, RDA)               (_B(PX), __SSEQmr(OP, MD, MB, MI, MS, RD, RDA))
#define _SSEQrm(PX, OP, RS, RSA, MD, MB, MI, MS)               (_B(PX), __SSEQrm(OP, RS, RSA, MD, MB, MI, MS))

#define _SSEPSrr(OP, RS, RD)      __SSELrr(OP, RS, _rX, RD, _rX)
#define _SSEPSmr(OP, MD, MB, MI, MS, RD) __SSELmr(OP, MD, MB, MI, MS, RD, _rX)
#define _SSEPSrm(OP, RS, MD, MB, MI, MS) __SSELrm(OP, RS, _rX, MD, MB, MI, MS)
#define _SSEPSirr(OP, IM, RS, RD)      __SSELirr(OP, IM, RS, RD)
#define _SSEPSimr(OP, IM, MD, MB, MI, MS, RD) __SSELimr(OP, IM, MD, MB, MI, MS, RD)

#define _SSEPDrr(OP, RS, RD)       _SSELrr(0x66, OP, RS, _rX, RD, _rX)
#define _SSEPDmr(OP, MD, MB, MI, MS, RD)  _SSELmr(0x66, OP, MD, MB, MI, MS, RD, _rX)
#define _SSEPDrm(OP, RS, MD, MB, MI, MS)  _SSELrm(0x66, OP, RS, _rX, MD, MB, MI, MS)
#define _SSEPDirr(OP, IM, RS, RD)       _SSELirr(0x66, OP, IM, RS, RD)
#define _SSEPDimr(OP, IM, MD, MB, MI, MS, RD)  _SSELimr(0x66, OP, IM, MD, MB, MI, MS, RD)

#define _SSESSrr(OP, RS, RD)       _SSELrr(0xf3, OP, RS, _rX, RD, _rX)
#define _SSESSmr(OP, MD, MB, MI, MS, RD)  _SSELmr(0xf3, OP, MD, MB, MI, MS, RD, _rX)
#define _SSESSrm(OP, RS, MD, MB, MI, MS)  _SSELrm(0xf3, OP, RS, _rX, MD, MB, MI, MS)
#define _SSESSirr(OP, IM, RS, RD)       _SSELirr(0xf3, OP, IM, RS, RD)
#define _SSESSimr(OP, IM, MD, MB, MI, MS, RD)  _SSELimr(0xf3, OP, IM, MD, MB, MI, MS, RD)

#define _SSESDrr(OP, RS, RD)       _SSELrr(0xf2, OP, RS, _rX, RD, _rX)
#define _SSESDmr(OP, MD, MB, MI, MS, RD)  _SSELmr(0xf2, OP, MD, MB, MI, MS, RD, _rX)
#define _SSESDrm(OP, RS, MD, MB, MI, MS)  _SSELrm(0xf2, OP, RS, _rX, MD, MB, MI, MS)
#define _SSESDirr(OP, IM, RS, RD)       _SSELirr(0xf2, OP, IM, RS, RD)
#define _SSESDimr(OP, IM, MD, MB, MI, MS, RD)  _SSELimr(0xf2, OP, IM, MD, MB, MI, MS, RD)

#define ADDPSrr(RS, RD)         _SSEPSrr(X86_SSE_ADD, RS, RD)
#define ADDPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_ADD, MD, MB, MI, MS, RD)
#define ADDPDrr(RS, RD)         _SSEPDrr(X86_SSE_ADD, RS, RD)
#define ADDPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_ADD, MD, MB, MI, MS, RD)

#define ADDSSrr(RS, RD)         _SSESSrr(X86_SSE_ADD, RS, RD)
#define ADDSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_ADD, MD, MB, MI, MS, RD)
#define ADDSDrr(RS, RD)         _SSESDrr(X86_SSE_ADD, RS, RD)
#define ADDSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_ADD, MD, MB, MI, MS, RD)

#define ANDNPSrr(RS, RD)        _SSEPSrr(X86_SSE_ANDN, RS, RD)
#define ANDNPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_ANDN, MD, MB, MI, MS, RD)
#define ANDNPDrr(RS, RD)        _SSEPDrr(X86_SSE_ANDN, RS, RD)
#define ANDNPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_ANDN, MD, MB, MI, MS, RD)

#define ANDPSrr(RS, RD)         _SSEPSrr(X86_SSE_AND, RS, RD)
#define ANDPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_AND, MD, MB, MI, MS, RD)
#define ANDPDrr(RS, RD)         _SSEPDrr(X86_SSE_AND, RS, RD)
#define ANDPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_AND, MD, MB, MI, MS, RD)

#define CMPPSrr(IM, RS, RD)     _SSEPSirr(X86_SSE_CMP, IM, RS, RD)
#define CMPPSmr(IM, MD, MB, MI, MS, RD) _SSEPSimr(X86_SSE_CMP, IM, MD, MB, MI, MS, RD)
#define CMPPDrr(IM, RS, RD)     _SSEPDirr(X86_SSE_CMP, IM, RS, RD)
#define CMPPDmr(IM, MD, MB, MI, MS, RD) _SSEPDimr(X86_SSE_CMP, IM, MD, MB, MI, MS, RD)

#define CMPSSrr(IM, RS, RD)     _SSESSirr(X86_SSE_CMP, IM, RS, RD)
#define CMPSSmr(IM, MD, MB, MI, MS, RD) _SSESSimr(X86_SSE_CMP, IM, MD, MB, MI, MS, RD)
#define CMPSDrr(IM, RS, RD)     _SSESDirr(X86_SSE_CMP, IM, RS, RD)
#define CMPSDmr(IM, MD, MB, MI, MS, RD) _SSESDimr(X86_SSE_CMP, IM, MD, MB, MI, MS, RD)
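
/* Usage sketch: the CMP forms take a predicate from the enum above as
 * their first argument; assuming X86_XMM0/X86_XMM1 constants from the
 * register enum:
 *
 *	CMPSSrr(X86_SSE_CC_LT, X86_XMM1, X86_XMM0);	// cmpltss %xmm1, %xmm0
 */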

#define DIVPSrr(RS, RD)         _SSEPSrr(X86_SSE_DIV, RS, RD)
#define DIVPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_DIV, MD, MB, MI, MS, RD)
#define DIVPDrr(RS, RD)         _SSEPDrr(X86_SSE_DIV, RS, RD)
#define DIVPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_DIV, MD, MB, MI, MS, RD)

#define DIVSSrr(RS, RD)         _SSESSrr(X86_SSE_DIV, RS, RD)
#define DIVSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_DIV, MD, MB, MI, MS, RD)
#define DIVSDrr(RS, RD)         _SSESDrr(X86_SSE_DIV, RS, RD)
#define DIVSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_DIV, MD, MB, MI, MS, RD)

#define MAXPSrr(RS, RD)         _SSEPSrr(X86_SSE_MAX, RS, RD)
#define MAXPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_MAX, MD, MB, MI, MS, RD)
#define MAXPDrr(RS, RD)         _SSEPDrr(X86_SSE_MAX, RS, RD)
#define MAXPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_MAX, MD, MB, MI, MS, RD)

#define MAXSSrr(RS, RD)         _SSESSrr(X86_SSE_MAX, RS, RD)
#define MAXSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_MAX, MD, MB, MI, MS, RD)
#define MAXSDrr(RS, RD)         _SSESDrr(X86_SSE_MAX, RS, RD)
#define MAXSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_MAX, MD, MB, MI, MS, RD)

#define MINPSrr(RS, RD)         _SSEPSrr(X86_SSE_MIN, RS, RD)
#define MINPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_MIN, MD, MB, MI, MS, RD)
#define MINPDrr(RS, RD)         _SSEPDrr(X86_SSE_MIN, RS, RD)
#define MINPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_MIN, MD, MB, MI, MS, RD)

#define MINSSrr(RS, RD)         _SSESSrr(X86_SSE_MIN, RS, RD)
#define MINSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_MIN, MD, MB, MI, MS, RD)
#define MINSDrr(RS, RD)         _SSESDrr(X86_SSE_MIN, RS, RD)
#define MINSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_MIN, MD, MB, MI, MS, RD)

#define MULPSrr(RS, RD)         _SSEPSrr(X86_SSE_MUL, RS, RD)
#define MULPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_MUL, MD, MB, MI, MS, RD)
#define MULPDrr(RS, RD)         _SSEPDrr(X86_SSE_MUL, RS, RD)
#define MULPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_MUL, MD, MB, MI, MS, RD)

#define MULSSrr(RS, RD)         _SSESSrr(X86_SSE_MUL, RS, RD)
#define MULSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_MUL, MD, MB, MI, MS, RD)
#define MULSDrr(RS, RD)         _SSESDrr(X86_SSE_MUL, RS, RD)
#define MULSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_MUL, MD, MB, MI, MS, RD)

#define ORPSrr(RS, RD)          _SSEPSrr(X86_SSE_OR, RS, RD)
#define ORPSmr(MD, MB, MI, MS, RD)  _SSEPSmr(X86_SSE_OR, MD, MB, MI, MS, RD)
#define ORPDrr(RS, RD)          _SSEPDrr(X86_SSE_OR, RS, RD)
#define ORPDmr(MD, MB, MI, MS, RD)  _SSEPDmr(X86_SSE_OR, MD, MB, MI, MS, RD)

#define RCPPSrr(RS, RD)         _SSEPSrr(X86_SSE_RCP, RS, RD)
#define RCPPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_RCP, MD, MB, MI, MS, RD)
#define RCPSSrr(RS, RD)         _SSESSrr(X86_SSE_RCP, RS, RD)
#define RCPSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_RCP, MD, MB, MI, MS, RD)

#define RSQRTPSrr(RS, RD)       _SSEPSrr(X86_SSE_RSQRT, RS, RD)
#define RSQRTPSmr(MD, MB, MI, MS, RD)   _SSEPSmr(X86_SSE_RSQRT, MD, MB, MI, MS, RD)
#define RSQRTSSrr(RS, RD)       _SSESSrr(X86_SSE_RSQRT, RS, RD)
#define RSQRTSSmr(MD, MB, MI, MS, RD)   _SSESSmr(X86_SSE_RSQRT, MD, MB, MI, MS, RD)

#define SQRTPSrr(RS, RD)        _SSEPSrr(X86_SSE_SQRT, RS, RD)
#define SQRTPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_SQRT, MD, MB, MI, MS, RD)
#define SQRTPDrr(RS, RD)        _SSEPDrr(X86_SSE_SQRT, RS, RD)
#define SQRTPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_SQRT, MD, MB, MI, MS, RD)

#define SQRTSSrr(RS, RD)        _SSESSrr(X86_SSE_SQRT, RS, RD)
#define SQRTSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_SQRT, MD, MB, MI, MS, RD)
#define SQRTSDrr(RS, RD)        _SSESDrr(X86_SSE_SQRT, RS, RD)
#define SQRTSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_SQRT, MD, MB, MI, MS, RD)

#define SUBPSrr(RS, RD)         _SSEPSrr(X86_SSE_SUB, RS, RD)
#define SUBPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_SUB, MD, MB, MI, MS, RD)
#define SUBPDrr(RS, RD)         _SSEPDrr(X86_SSE_SUB, RS, RD)
#define SUBPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_SUB, MD, MB, MI, MS, RD)

#define SUBSSrr(RS, RD)         _SSESSrr(X86_SSE_SUB, RS, RD)
#define SUBSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_SUB, MD, MB, MI, MS, RD)
#define SUBSDrr(RS, RD)         _SSESDrr(X86_SSE_SUB, RS, RD)
#define SUBSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_SUB, MD, MB, MI, MS, RD)

#define XORPSrr(RS, RD)         _SSEPSrr(X86_SSE_XOR, RS, RD)
#define XORPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_XOR, MD, MB, MI, MS, RD)
#define XORPDrr(RS, RD)         _SSEPDrr(X86_SSE_XOR, RS, RD)
#define XORPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_XOR, MD, MB, MI, MS, RD)

#define COMISSrr(RS, RD)        _SSEPSrr(X86_SSE_COMI, RS, RD)
#define COMISSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_COMI, MD, MB, MI, MS, RD)
#define COMISDrr(RS, RD)        _SSEPDrr(X86_SSE_COMI, RS, RD)
#define COMISDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_COMI, MD, MB, MI, MS, RD)

#define UCOMISSrr(RS, RD)       _SSEPSrr(X86_SSE_UCOMI, RS, RD)
#define UCOMISSmr(MD, MB, MI, MS, RD)   _SSEPSmr(X86_SSE_UCOMI, MD, MB, MI, MS, RD)
#define UCOMISDrr(RS, RD)       _SSEPDrr(X86_SSE_UCOMI, RS, RD)
#define UCOMISDmr(MD, MB, MI, MS, RD)   _SSEPDmr(X86_SSE_UCOMI, MD, MB, MI, MS, RD)

#define MOVAPSrr(RS, RD)        _SSEPSrr(0x28, RS, RD)
#define MOVAPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(0x28, MD, MB, MI, MS, RD)
#define MOVAPSrm(RS, MD, MB, MI, MS)    _SSEPSrm(0x29, RS, MD, MB, MI, MS)

#define MOVAPDrr(RS, RD)        _SSEPDrr(0x28, RS, RD)
#define MOVAPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(0x28, MD, MB, MI, MS, RD)
#define MOVAPDrm(RS, MD, MB, MI, MS)    _SSEPDrm(0x29, RS, MD, MB, MI, MS)

#define CVTDQ2PDrr(RS, RD)       _SSELrr(0xf3, X86_SSE_CVTDQ2PD, RS, _rX, RD, _rX)
#define CVTDQ2PDmr(MD, MB, MI, MS, RD)   _SSELmr(0xf3, X86_SSE_CVTDQ2PD, MD, MB, MI, MS, RD, _rX)
#define CVTDQ2PSrr(RS, RD)      __SSELrr(X86_SSE_CVTDQ2PS, RS, _rX, RD, _rX)
#define CVTDQ2PSmr(MD, MB, MI, MS, RD)  __SSELmr(X86_SSE_CVTDQ2PS, MD, MB, MI, MS, RD, _rX)
#define CVTPD2DQrr(RS, RD)       _SSELrr(0xf2, X86_SSE_CVTPD2DQ, RS, _rX, RD, _rX)
#define CVTPD2DQmr(MD, MB, MI, MS, RD)   _SSELmr(0xf2, X86_SSE_CVTPD2DQ, MD, MB, MI, MS, RD, _rX)
#define CVTPD2PIrr(RS, RD)       _SSELrr(0x66, X86_SSE_CVTPD2PI, RS, _rX, RD, _rM)
#define CVTPD2PImr(MD, MB, MI, MS, RD)   _SSELmr(0x66, X86_SSE_CVTPD2PI, MD, MB, MI, MS, RD, _rM)
#define CVTPD2PSrr(RS, RD)       _SSELrr(0x66, X86_SSE_CVTPD2PS, RS, _rX, RD, _rX)
#define CVTPD2PSmr(MD, MB, MI, MS, RD)   _SSELmr(0x66, X86_SSE_CVTPD2PS, MD, MB, MI, MS, RD, _rX)
#define CVTPI2PDrr(RS, RD)       _SSELrr(0x66, X86_SSE_CVTPI2PD, RS, _rM, RD, _rX)
#define CVTPI2PDmr(MD, MB, MI, MS, RD)   _SSELmr(0x66, X86_SSE_CVTPI2PD, MD, MB, MI, MS, RD, _rX)
#define CVTPI2PSrr(RS, RD)      __SSELrr(X86_SSE_CVTPI2PS, RS, _rM, RD, _rX)
#define CVTPI2PSmr(MD, MB, MI, MS, RD)  __SSELmr(X86_SSE_CVTPI2PS, MD, MB, MI, MS, RD, _rX)
#define CVTPS2DQrr(RS, RD)       _SSELrr(0x66, X86_SSE_CVTPS2DQ, RS, _rX, RD, _rX)
#define CVTPS2DQmr(MD, MB, MI, MS, RD)   _SSELmr(0x66, X86_SSE_CVTPS2DQ, MD, MB, MI, MS, RD, _rX)
#define CVTPS2PDrr(RS, RD)      __SSELrr(X86_SSE_CVTPS2PD, RS, _rX, RD, _rX)
#define CVTPS2PDmr(MD, MB, MI, MS, RD)  __SSELmr(X86_SSE_CVTPS2PD, MD, MB, MI, MS, RD, _rX)
#define CVTPS2PIrr(RS, RD)      __SSELrr(X86_SSE_CVTPS2PI, RS, _rX, RD, _rM)
#define CVTPS2PImr(MD, MB, MI, MS, RD)  __SSELmr(X86_SSE_CVTPS2PI, MD, MB, MI, MS, RD, _rM)
#define CVTSD2SILrr(RS, RD)      _SSELrr(0xf2, X86_SSE_CVTSD2SI, RS, _rX, RD, _r4)
#define CVTSD2SILmr(MD, MB, MI, MS, RD)  _SSELmr(0xf2, X86_SSE_CVTSD2SI, MD, MB, MI, MS, RD, _r4)
#define CVTSD2SIQrr(RS, RD)      _SSEQrr(0xf2, X86_SSE_CVTSD2SI, RS, _rX, RD, _r8)
#define CVTSD2SIQmr(MD, MB, MI, MS, RD)  _SSEQmr(0xf2, X86_SSE_CVTSD2SI, MD, MB, MI, MS, RD, _r8)
#define CVTSD2SSrr(RS, RD)       _SSELrr(0xf2, X86_SSE_CVTSD2SS, RS, _rX, RD, _rX)
#define CVTSD2SSmr(MD, MB, MI, MS, RD)   _SSELmr(0xf2, X86_SSE_CVTSD2SS, MD, MB, MI, MS, RD, _rX)
#define CVTSI2SDLrr(RS, RD)      _SSELrr(0xf2, X86_SSE_CVTSI2SD, RS, _r4, RD, _rX)
#define CVTSI2SDLmr(MD, MB, MI, MS, RD)  _SSELmr(0xf2, X86_SSE_CVTSI2SD, MD, MB, MI, MS, RD, _rX)
#define CVTSI2SDQrr(RS, RD)      _SSEQrr(0xf2, X86_SSE_CVTSI2SD, RS, _r8, RD, _rX)
#define CVTSI2SDQmr(MD, MB, MI, MS, RD)  _SSEQmr(0xf2, X86_SSE_CVTSI2SD, MD, MB, MI, MS, RD, _rX)
#define CVTSI2SSLrr(RS, RD)      _SSELrr(0xf3, X86_SSE_CVTSI2SS, RS, _r4, RD, _rX)
#define CVTSI2SSLmr(MD, MB, MI, MS, RD)  _SSELmr(0xf3, X86_SSE_CVTSI2SS, MD, MB, MI, MS, RD, _rX)
#define CVTSI2SSQrr(RS, RD)      _SSEQrr(0xf3, X86_SSE_CVTSI2SS, RS, _r8, RD, _rX)
#define CVTSI2SSQmr(MD, MB, MI, MS, RD)  _SSEQmr(0xf3, X86_SSE_CVTSI2SS, MD, MB, MI, MS, RD, _rX)
#define CVTSS2SDrr(RS, RD)       _SSELrr(0xf3, X86_SSE_CVTSS2SD, RS, _rX, RD, _rX)
#define CVTSS2SDmr(MD, MB, MI, MS, RD)   _SSELmr(0xf3, X86_SSE_CVTSS2SD, MD, MB, MI, MS, RD, _rX)
#define CVTSS2SILrr(RS, RD)      _SSELrr(0xf3, X86_SSE_CVTSS2SI, RS, _rX, RD, _r4)
#define CVTSS2SILmr(MD, MB, MI, MS, RD)  _SSELmr(0xf3, X86_SSE_CVTSS2SI, MD, MB, MI, MS, RD, _r4)
#define CVTSS2SIQrr(RS, RD)      _SSEQrr(0xf3, X86_SSE_CVTSS2SI, RS, _rX, RD, _r8)
#define CVTSS2SIQmr(MD, MB, MI, MS, RD)  _SSEQmr(0xf3, X86_SSE_CVTSS2SI, MD, MB, MI, MS, RD, _r8)
#define CVTTPD2PIrr(RS, RD)      _SSELrr(0x66, X86_SSE_CVTTPD2PI, RS, _rX, RD, _rM)
#define CVTTPD2PImr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTTPD2PI, MD, MB, MI, MS, RD, _rM)
#define CVTTPD2DQrr(RS, RD)      _SSELrr(0x66, X86_SSE_CVTTPD2DQ, RS, _rX, RD, _rX)
#define CVTTPD2DQmr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTTPD2DQ, MD, MB, MI, MS, RD, _rX)
#define CVTTPS2DQrr(RS, RD)      _SSELrr(0xf3, X86_SSE_CVTTPS2DQ, RS, _rX, RD, _rX)
#define CVTTPS2DQmr(MD, MB, MI, MS, RD)  _SSELmr(0xf3, X86_SSE_CVTTPS2DQ, MD, MB, MI, MS, RD, _rX)
#define CVTTPS2PIrr(RS, RD)     __SSELrr(X86_SSE_CVTTPS2PI, RS, _rX, RD, _rM)
#define CVTTPS2PImr(MD, MB, MI, MS, RD) __SSELmr(X86_SSE_CVTTPS2PI, MD, MB, MI, MS, RD, _rM)
#define CVTTSD2SILrr(RS, RD)         _SSELrr(0xf2, X86_SSE_CVTTSD2SI, RS, _rX, RD, _r4)
#define CVTTSD2SILmr(MD, MB, MI, MS, RD) _SSELmr(0xf2, X86_SSE_CVTTSD2SI, MD, MB, MI, MS, RD, _r4)
#define CVTTSD2SIQrr(RS, RD)         _SSEQrr(0xf2, X86_SSE_CVTTSD2SI, RS, _rX, RD, _r8)
#define CVTTSD2SIQmr(MD, MB, MI, MS, RD) _SSEQmr(0xf2, X86_SSE_CVTTSD2SI, MD, MB, MI, MS, RD, _r8)
#define CVTTSS2SILrr(RS, RD)         _SSELrr(0xf3, X86_SSE_CVTTSS2SI, RS, _rX, RD, _r4)
#define CVTTSS2SILmr(MD, MB, MI, MS, RD) _SSELmr(0xf3, X86_SSE_CVTTSS2SI, MD, MB, MI, MS, RD, _r4)
#define CVTTSS2SIQrr(RS, RD)         _SSEQrr(0xf3, X86_SSE_CVTTSS2SI, RS, _rX, RD, _r8)
#define CVTTSS2SIQmr(MD, MB, MI, MS, RD) _SSEQmr(0xf3, X86_SSE_CVTTSS2SI, MD, MB, MI, MS, RD, _r8)
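
/* In the conversions above, the register-class tags name the operand kind
   expected by the underlying emitters: _r4 a 32-bit general register, _r8 a
   64-bit general register, _rM an MMX register and _rX an XMM register.
   The L/Q name suffixes select the 32-bit (_SSELrr/_SSELmr) or the
   REX.W-prefixed 64-bit (_SSEQrr/_SSEQmr) integer operand forms.  */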

#define MOVDXDrr(RS, RD)         _SSELrr(0x66, 0x6e, RS, _r4, RD, _rX)
#define MOVDXDmr(MD, MB, MI, MS, RD)     _SSELmr(0x66, 0x6e, MD, MB, MI, MS, RD, _rX)
#define MOVQXDrr(RS, RD)         _SSEQrr(0x66, 0x6e, RS, _r8, RD, _rX)
#define MOVQXDmr(MD, MB, MI, MS, RD)     _SSEQmr(0x66, 0x6e, MD, MB, MI, MS, RD, _rX)

#define MOVDXSrr(RS, RD)         _SSELrr(0x66, 0x7e, RD, _r4, RS, _rX)
#define MOVDXSrm(RS, MD, MB, MI, MS)     _SSELrm(0x66, 0x7e, RS, _rX, MD, MB, MI, MS)
#define MOVQXSrr(RS, RD)         _SSEQrr(0x66, 0x7e, RD, _r8, RS, _rX)
#define MOVQXSrm(RS, MD, MB, MI, MS)     _SSEQrm(0x66, 0x7e, RS, _rX, MD, MB, MI, MS)

#define MOVDLMrr(RS, RD)        __SSELrr(0x6e, RS, _r4, RD, _rM)
#define MOVDLMmr(MD, MB, MI, MS, RD)    __SSELmr(0x6e, MD, MB, MI, MS, RD, _rM)
#define MOVDQMrr(RS, RD)        __SSEQrr(0x6e, RS, _r8, RD, _rM)
#define MOVDQMmr(MD, MB, MI, MS, RD)    __SSEQmr(0x6e, MD, MB, MI, MS, RD, _rM)

#define MOVDMLrr(RS, RD)        __SSELrr(0x7e, RS, _rM, RD, _r4)
#define MOVDMLrm(RS, MD, MB, MI, MS)    __SSELrm(0x7e, RS, _rM, MD, MB, MI, MS)
#define MOVDMQrr(RS, RD)        __SSEQrr(0x7e, RS, _rM, RD, _r8)
#define MOVDMQrm(RS, MD, MB, MI, MS)    __SSEQrm(0x7e, RS, _rM, MD, MB, MI, MS)

#define MOVDQ2Qrr(RS, RD)        _SSELrr(0xf2, 0xd6, RS, _rX, RD, _rM)
#define MOVMSKPSrr(RS, RD)      __SSELrr(0x50, RS, _rX, RD, _r4)
#define MOVMSKPDrr(RS, RD)       _SSELrr(0x66, 0x50, RS, _rX, RD, _r4)

#define MOVHLPSrr(RS, RD)       __SSELrr(0x12, RS, _rX, RD, _rX)
#define MOVLHPSrr(RS, RD)       __SSELrr(0x16, RS, _rX, RD, _rX)

#define MOVDQArr(RS, RD)         _SSELrr(0x66, 0x6f, RS, _rX, RD, _rX)
#define MOVDQAmr(MD, MB, MI, MS, RD)     _SSELmr(0x66, 0x6f, MD, MB, MI, MS, RD, _rX)
#define MOVDQArm(RS, MD, MB, MI, MS)     _SSELrm(0x66, 0x7f, RS, _rX, MD, MB, MI, MS)

#define MOVDQUrr(RS, RD)         _SSELrr(0xf3, 0x6f, RS, _rX, RD, _rX)
#define MOVDQUmr(MD, MB, MI, MS, RD)     _SSELmr(0xf3, 0x6f, MD, MB, MI, MS, RD, _rX)
#define MOVDQUrm(RS, MD, MB, MI, MS)     _SSELrm(0xf3, 0x7f, RS, _rX, MD, MB, MI, MS)

#define MOVHPDmr(MD, MB, MI, MS, RD)     _SSELmr(0x66, 0x16, MD, MB, MI, MS, RD, _rX)
#define MOVHPDrm(RS, MD, MB, MI, MS)     _SSELrm(0x66, 0x17, RS, _rX, MD, MB, MI, MS)
#define MOVHPSmr(MD, MB, MI, MS, RD)    __SSELmr(0x16, MD, MB, MI, MS, RD, _rX)
#define MOVHPSrm(RS, MD, MB, MI, MS)    __SSELrm(0x17, RS, _rX, MD, MB, MI, MS)

#define MOVLPDmr(MD, MB, MI, MS, RD)     _SSELmr(0x66, 0x12, MD, MB, MI, MS, RD, _rX)
#define MOVLPDrm(RS, MD, MB, MI, MS)     _SSELrm(0x66, 0x13, RS, _rX, MD, MB, MI, MS)
#define MOVLPSmr(MD, MB, MI, MS, RD)    __SSELmr(0x12, MD, MB, MI, MS, RD, _rX)
#define MOVLPSrm(RS, MD, MB, MI, MS)    __SSELrm(0x13, RS, _rX, MD, MB, MI, MS)
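
/* Usage sketch (illustrative only): emitting a short SSE sequence with the
 * macros above.  Assumes the including code provides the x86_emit_*()
 * primitives described at the top of this file, and that the XMM register
 * enumerators derived from X86_RegXMM_Base (X86_XMM0, X86_XMM1, ...) are
 * available; the register choices are examples, not requirements.
 *
 *	MOVAPSrr(X86_XMM0, X86_XMM1);		movaps %xmm0,%xmm1
 *	MULPSrr(X86_XMM1, X86_XMM1);		mulps  %xmm1,%xmm1  (square lanes)
 *	XORPSrr(X86_XMM2, X86_XMM2);		xorps  %xmm2,%xmm2  (zero xmm2)
 */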


/* --- Floating-Point instructions ----------------------------------------- */

#define _ESCmi(D, B, I, S, OP)  (_REXLrm(0, B, I), _O_r_X(0xd8 | (OP & 7), (OP >> 3), D, B, I, S))
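
/* In _ESCmi above, OP is an octal constant: the low three bits select the
   D8..DF escape opcode byte (0xd8 | (OP & 7)) and the next three bits supply
   the ModRM reg field (OP >> 3).  FSTPTm, for example, passes 073, which
   yields opcode 0xdb with /7 (FSTP m80fp).  */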

#define FLDr(R)         _OOr(0xd9c0, _rN(R))
#define FLDLm(D, B, I, S)      _ESCmi(D, B, I, S, 005)
#define FLDSm(D, B, I, S)      _ESCmi(D, B, I, S, 001)
#define FLDTm(D, B, I, S)      _ESCmi(D, B, I, S, 053)

#define FSTr(R)         _OOr(0xddd0, _rN(R))
#define FSTSm(D, B, I, S)      _ESCmi(D, B, I, S, 021)
#define FSTLm(D, B, I, S)      _ESCmi(D, B, I, S, 025)

#define FSTPr(R)        _OOr(0xddd8, _rN(R))
#define FSTPSm(D, B, I, S)     _ESCmi(D, B, I, S, 031)
#define FSTPLm(D, B, I, S)     _ESCmi(D, B, I, S, 035)
#define FSTPTm(D, B, I, S)     _ESCmi(D, B, I, S, 073)

#define FADDr0(R)       _OOr(0xd8c0, _rN(R))
#define FADD0r(R)       _OOr(0xdcc0, _rN(R))
#define FADDP0r(R)      _OOr(0xdec0, _rN(R))
#define FADDSm(D, B, I, S)     _ESCmi(D, B, I, S, 000)
#define FADDLm(D, B, I, S)     _ESCmi(D, B, I, S, 004)
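
/* For the two-register arithmetic forms, the name suffix reads source then
   destination, with "0" standing for %st(0): FADDr0(R) encodes D8 C0+R
   (fadd %st(R),%st), FADD0r(R) encodes DC C0+R (fadd %st,%st(R)), and
   FADDP0r(R) is the popping DE C0+R form.  The FSUB/FMUL/FDIV groups below
   use the same naming convention.  */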

#define FSUBSm(D, B, I, S)     _ESCmi(D, B, I, S, 040)
#define FSUBLm(D, B, I, S)     _ESCmi(D, B, I, S, 044)
#define FSUBr0(R)       _OOr(0xd8e0, _rN(R))
#define FSUB0r(R)       _OOr(0xdce8, _rN(R))
#define FSUBP0r(R)      _OOr(0xdee8, _rN(R))

#define FSUBRr0(R)      _OOr(0xd8e8, _rN(R))
#define FSUBR0r(R)      _OOr(0xdce0, _rN(R))
#define FSUBRP0r(R)     _OOr(0xdee0, _rN(R))
#define FSUBRSm(D, B, I, S)    _ESCmi(D, B, I, S, 050)
#define FSUBRLm(D, B, I, S)    _ESCmi(D, B, I, S, 054)

#define FMULr0(R)       _OOr(0xd8c8, _rN(R))
#define FMUL0r(R)       _OOr(0xdcc8, _rN(R))
#define FMULP0r(R)      _OOr(0xdec8, _rN(R))
#define FMULSm(D, B, I, S)     _ESCmi(D, B, I, S, 010)
#define FMULLm(D, B, I, S)     _ESCmi(D, B, I, S, 014)

#define FDIVr0(R)       _OOr(0xd8f0, _rN(R))
#define FDIV0r(R)       _OOr(0xdcf8, _rN(R))
#define FDIVP0r(R)      _OOr(0xdef8, _rN(R))
#define FDIVSm(D, B, I, S)     _ESCmi(D, B, I, S, 060)
#define FDIVLm(D, B, I, S)     _ESCmi(D, B, I, S, 064)

#define FDIVRr0(R)      _OOr(0xd8f8, _rN(R))
#define FDIVR0r(R)      _OOr(0xdcf0, _rN(R))
#define FDIVRP0r(R)     _OOr(0xdef0, _rN(R))
#define FDIVRSm(D, B, I, S)    _ESCmi(D, B, I, S, 070)
#define FDIVRLm(D, B, I, S)    _ESCmi(D, B, I, S, 074)

#define FCMOVBr0(R)     _OOr(0xdac0, _rN(R))
#define FCMOVBEr0(R)        _OOr(0xdad0, _rN(R))
#define FCMOVEr0(R)     _OOr(0xdac8, _rN(R))
#define FCMOVNBr0(R)        _OOr(0xdbc0, _rN(R))
#define FCMOVNBEr0(R)       _OOr(0xdbd0, _rN(R))
#define FCMOVNEr0(R)        _OOr(0xdbc8, _rN(R))
#define FCMOVNUr0(R)        _OOr(0xdbd8, _rN(R))
#define FCMOVUr0(R)     _OOr(0xdad8, _rN(R))
#define FCOMIr0(R)      _OOr(0xdbf0, _rN(R))
#define FCOMIPr0(R)     _OOr(0xdff0, _rN(R))

#define FCOMr(R)        _OOr(0xd8d0, _rN(R))
#define FCOMSm(D, B, I, S)     _ESCmi(D, B, I, S, 020)
#define FCOMLm(D, B, I, S)     _ESCmi(D, B, I, S, 024)

#define FCOMPr(R)       _OOr(0xd8d8, _rN(R))
#define FCOMPSm(D, B, I, S)    _ESCmi(D, B, I, S, 030)
#define FCOMPLm(D, B, I, S)    _ESCmi(D, B, I, S, 034)

#define FUCOMIr0(R)     _OOr(0xdbe8, _rN(R))
#define FUCOMIPr0(R)        _OOr(0xdfe8, _rN(R))
#define FUCOMPr(R)      _OOr(0xdde8, _rN(R))
#define FUCOMr(R)       _OOr(0xdde0, _rN(R))

#define FIADDLm(D, B, I, S)    _ESCmi(D, B, I, S, 002)
#define FICOMLm(D, B, I, S)    _ESCmi(D, B, I, S, 022)
#define FICOMPLm(D, B, I, S)   _ESCmi(D, B, I, S, 032)
#define FIDIVLm(D, B, I, S)    _ESCmi(D, B, I, S, 062)
#define FIDIVRLm(D, B, I, S)   _ESCmi(D, B, I, S, 072)
#define FILDLm(D, B, I, S)     _ESCmi(D, B, I, S, 003)
#define FILDQm(D, B, I, S)     _ESCmi(D, B, I, S, 057)
#define FIMULLm(D, B, I, S)    _ESCmi(D, B, I, S, 012)
#define FISTLm(D, B, I, S)     _ESCmi(D, B, I, S, 023)
#define FISTPLm(D, B, I, S)    _ESCmi(D, B, I, S, 033)
#define FISTPQm(D, B, I, S)    _ESCmi(D, B, I, S, 077)
#define FISUBLm(D, B, I, S)    _ESCmi(D, B, I, S, 042)
#define FISUBRLm(D, B, I, S)   _ESCmi(D, B, I, S, 052)

#define FREEr(R)        _OOr(0xddc0, _rN(R))
#define FXCHr(R)        _OOr(0xd9c8, _rN(R))
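
/* Usage sketch (illustrative only): the memory forms take the same
 * displacement/base/index/scale operand quadruple (D, B, I, S) used by the
 * other memory operands in this file; the operand values below are symbolic,
 * not concrete register enumerators:
 *
 *	FLDLm(D, B, I, S);	fldl  D(B,I,S)   load a double
 *	FMULLm(D, B, I, S);	fmull D(B,I,S)   st(0) *= second double
 *	FSTPLm(D, B, I, S);	fstpl D(B,I,S)   store product and pop
 */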

#endif /* X86_RTASM_H */